Browse Source

vanuatu null hypothesis expected correlations

Lucas Gautheron 2 years ago
parent
commit
91a0135a89
2 changed files with 429 additions and 0 deletions
  1. 285 0
      code/models/vanuatu_null_hyp.py
  2. 144 0
      code/plots/vanuatu_null_corr.py

+ 285 - 0
code/models/vanuatu_null_hyp.py

@@ -0,0 +1,285 @@
+#!/usr/bin/env python3
+
+from ChildProject.projects import ChildProject
+from ChildProject.annotations import AnnotationManager
+from ChildProject.metrics import segments_to_annotation
+
+import argparse
+
+import datalad.api
+from os.path import join as opj
+from os.path import basename, exists
+
+import multiprocessing as mp
+
+import numpy as np
+import pandas as pd
+import pickle
+from pyannote.core import Annotation, Segment, Timeline
+
+import stan
+
+parser = argparse.ArgumentParser(description = 'main model described throughout the notes.')
+parser.add_argument('--group', default = 'child', choices = ['corpus', 'child'])
+parser.add_argument('--chains', default = 4, type = int)
+parser.add_argument('--samples', default = 2000, type = int)
+parser.add_argument('--validation', default = 0, type = float)
+parser.add_argument('--output', default = 'model3')
+args = parser.parse_args()
+
+def extrude(self, removed, mode: str = 'intersection'):
+    if isinstance(removed, Segment):
+        removed = Timeline([removed])
+
+    truncating_support = removed.gaps(support=self.extent())
+    # loose for truncate means strict for crop and vice-versa
+    if mode == "loose":
+        mode = "strict"
+    elif mode == "strict":
+        mode = "loose"
+    
+    return self.crop(truncating_support, mode=mode)
+
+def compute_counts(parameters):
+    corpus = parameters['corpus']
+    annotator = parameters['annotator']
+    speakers = ['CHI', 'OCH', 'FEM', 'MAL']
+
+    project = ChildProject(parameters['path'])
+    am = AnnotationManager(project)
+    am.read()
+
+    intersection = AnnotationManager.intersection(
+        am.annotations, ['vtc', annotator]
+    )
+
+    intersection['path'] = intersection.apply(
+        lambda r: opj(project.path, 'annotations', r['set'], 'converted', r['annotation_filename']),
+        axis = 1
+    )
+    datalad.api.get(list(intersection['path'].unique()))
+
+    intersection = intersection.merge(project.recordings[['recording_filename', 'child_id']], how = 'left')
+    intersection['child'] = corpus + '_' + intersection['child_id'].astype(str)
+    intersection['duration'] = intersection['range_offset']-intersection['range_onset']
+    print(corpus, annotator, (intersection['duration']/1000/2).sum()/3600)
+
+    data = []
+    for child, ann in intersection.groupby('child'):
+        #print(corpus, child)
+
+        segments = am.get_collapsed_segments(ann)
+        if 'speaker_type' not in segments.columns:
+            continue
+
+        segments = segments[segments['speaker_type'].isin(speakers)]
+        
+        vtc = {
+            speaker: segments_to_annotation(segments[(segments['set'] == 'vtc') & (segments['speaker_type'] == speaker)], 'speaker_type').get_timeline()
+            for speaker in speakers
+        }
+
+        truth = {
+            speaker: segments_to_annotation(segments[(segments['set'] == annotator) & (segments['speaker_type'] == speaker)], 'speaker_type').get_timeline()
+            for speaker in speakers
+        }
+
+        for speaker_A in speakers:
+            vtc[f'{speaker_A}_vocs_explained'] = vtc[speaker_A].crop(truth[speaker_A], mode = 'loose')
+            vtc[f'{speaker_A}_vocs_fp'] = extrude(vtc[speaker_A], vtc[f'{speaker_A}_vocs_explained'])
+            vtc[f'{speaker_A}_vocs_fn'] = extrude(truth[speaker_A], truth[speaker_A].crop(vtc[speaker_A], mode = 'loose'))
+
+            for speaker_B in speakers:
+                vtc[f'{speaker_A}_vocs_fp_{speaker_B}'] = vtc[f'{speaker_A}_vocs_fp'].crop(truth[speaker_B], mode = 'loose')
+
+                for speaker_C in speakers:
+                    if speaker_C != speaker_B and speaker_C != speaker_A:
+                        vtc[f'{speaker_A}_vocs_fp_{speaker_B}'] = extrude(
+                            vtc[f'{speaker_A}_vocs_fp_{speaker_B}'],
+                            vtc[f'{speaker_A}_vocs_fp_{speaker_B}'].crop(truth[speaker_C], mode = 'loose')
+                        )
+
+
+        d = {}
+        for i, speaker_A in enumerate(speakers):
+            for j, speaker_B in enumerate(speakers):
+                if i != j:
+                    z = len(vtc[f'{speaker_A}_vocs_fp_{speaker_B}'])
+                else:
+                    z = min(len(vtc[f'{speaker_A}_vocs_explained']), len(truth[speaker_A]))
+
+                d[f'vtc_{i}_{j}'] = z
+
+            d[f'truth_{i}'] = len(truth[speaker_A])
+            d['child'] = child
+
+        d['duration'] = ann['duration'].sum()/2/1000
+        data.append(d)
+
+    return pd.DataFrame(data).assign(
+        corpus = corpus,
+    )
+
+stan_code = """
+data {
+  int<lower=1> n_clips;   // number of clips
+  int<lower=1> n_groups; // number of groups
+  int<lower=1> n_classes; // number of classes
+  int group[n_clips];
+  int vtc[n_clips,n_classes,n_classes];
+  int truth[n_clips,n_classes];
+
+  int<lower=1> n_validation;
+  int<lower=1> n_sim;
+
+  real<lower=0> rates_alphas[n_classes];
+  real<lower=0> rates_betas[n_classes];
+}
+
+parameters {
+  matrix<lower=0,upper=1>[n_classes,n_classes] mus;
+  matrix<lower=1>[n_classes,n_classes] etas;
+  matrix<lower=0,upper=1>[n_classes,n_classes] group_confusion[n_groups];
+}
+
+transformed parameters {
+  matrix<lower=0>[n_classes,n_classes] alphas;
+  matrix<lower=0>[n_classes,n_classes] betas;
+
+  alphas = mus * etas;
+  betas = (1-mus) * etas;
+}
+
+model {
+    for (k in n_validation:n_clips) {
+        for (i in 1:n_classes) {
+            for (j in 1:n_classes) {
+                vtc[k,i,j] ~ binomial(truth[k,j], group_confusion[group[k],j,i]);
+            }
+        }
+    }
+
+    for (i in 1:n_classes) {
+        for (j in 1:n_classes) {
+            mus[i,j] ~ beta(1,1);
+            etas[i,j] ~ pareto(1,1.5);
+        }
+    }
+
+    for (c in 1:n_groups) {
+        for (i in 1:n_classes) {
+            for (j in 1:n_classes) {
+                group_confusion[c,i,j] ~ beta(alphas[i,j], betas[i,j]);
+            }
+        }
+    }
+}
+
+generated quantities {
+    int pred[n_clips,n_classes,n_classes];
+    matrix[n_classes,n_classes] probs[n_groups];
+    matrix[n_classes,n_classes] log_lik[n_clips];
+
+    int sim_truth[n_sim,n_classes];
+    int sim_vtc[n_sim,n_classes];
+    vector[n_classes] lambdas;
+    real chi_adu_coef;
+
+    if (uniform_rng(0,1) > 0.99) {
+        chi_adu_coef = uniform_rng(0,1);
+    }
+    else {
+        chi_adu_coef = 0;
+    }
+
+    for (c in 1:n_groups) {
+        for (i in 1:n_classes) {
+            for (j in 1:n_classes) {
+                probs[c,i,j] = beta_rng(alphas[i,j], betas[i,j]);
+            }
+        }
+    }
+
+    for (k in 1:n_clips) {
+        for (i in 1:n_classes) {
+            for (j in 1:n_classes) {
+                if (k >= n_validation) {
+                    pred[k,i,j] = binomial_rng(truth[k,j], group_confusion[group[k],i,j]);
+                    log_lik[k,i,j] = binomial_lpmf(vtc[k,i,j] | truth[k,j], group_confusion[group[k],j,i]);
+                }
+                else {
+                    pred[k,i,j] = binomial_rng(truth[k,j], probs[group[k],j,i]);
+                    log_lik[k,i,j] = beta_lpdf(probs[group[k],j,i] | alphas[j,i], betas[j,i]);
+                    log_lik[k,i,j] += binomial_lpmf(vtc[k,i,j] | truth[k,j], probs[group[k],j,i]);
+                }
+            }
+        }
+    }
+
+    real lambda;
+    for (k in 1:n_sim) {
+        for (i in 2:n_classes) {
+            lambda = gamma_rng(rates_alphas[i], rates_betas[i]);
+            sim_truth[k,i] = poisson_rng(lambda);
+        }
+        lambda = gamma_rng(rates_alphas[1], rates_betas[1]);
+        sim_truth[k,1] = poisson_rng(lambda + chi_adu_coef*(sim_truth[k,3]+sim_truth[k,4]));
+    }
+
+    for (k in 1:n_sim) {
+        for (i in 1:n_classes) {
+            sim_vtc[k,i] = 0;
+            for (j in 1:n_classes) {
+                real p = beta_rng(alphas[j,i], betas[j,i]);
+                sim_vtc[k,i] += binomial_rng(sim_truth[k,j], p);
+            }
+        }
+    }
+}
+"""
+
+if __name__ == "__main__":
+    annotators = pd.read_csv('input/annotators.csv')
+    annotators['path'] = annotators['corpus'].apply(lambda c: opj('input', c))
+
+    with mp.Pool(processes = 8) as pool:
+        data = pd.concat(pool.map(compute_counts, annotators.to_dict(orient = 'records')))
+
+    data = data.sample(frac = 1)
+    duration = data['duration'].sum()
+
+    vtc = np.moveaxis([[data[f'vtc_{j}_{i}'].values for i in range(4)] for j in range(4)], -1, 0)
+    truth = np.transpose([data[f'truth_{i}'].values for i in range(4)])
+
+    print(vtc.shape)
+
+    rates = pd.read_csv('output/speech_dist.csv')
+
+    data = {
+        'n_clips': truth.shape[0],
+        'n_classes': truth.shape[1],
+        'n_groups': data[args.group].nunique(),
+        'n_validation': max(1, int(truth.shape[0]*args.validation)),
+        'n_sim': 40,
+        'group': 1+data[args.group].astype('category').cat.codes.values,
+        'truth': truth.astype(int),
+        'vtc': vtc.astype(int),
+        'rates_alphas': rates['alpha'].values,
+        'rates_betas': rates['beta'].values
+    }
+
+    print(f"clips: {data['n_clips']}")
+    print(f"groups: {data['n_groups']}")
+    print("true vocs: {}".format(np.sum(data['truth'])))
+    print("vtc vocs: {}".format(np.sum(data['vtc'])))
+    print("duration: {}".format(duration))
+
+    with open(f'data_{args.output}.pickle', 'wb') as fp:
+        pickle.dump(data, fp, pickle.HIGHEST_PROTOCOL)
+
+    posterior = stan.build(stan_code, data = data)
+    fit = posterior.sample(num_chains = args.chains, num_samples = args.samples)
+    df = fit.to_frame()
+    df.to_parquet(f'fit_{args.output}.parquet')
+
+

+ 144 - 0
code/plots/vanuatu_null_corr.py

@@ -0,0 +1,144 @@
+#!/usr/bin/env python3
+
+import pandas as pd
+import pickle
+import numpy as np
+import pyarrow.parquet as pq
+
+import argparse
+import matplotlib
+import matplotlib.pyplot as plt
+matplotlib.use("pgf")
+matplotlib.rcParams.update({
+    "pgf.texsystem": "xelatex",
+    'font.family': 'serif',
+    "font.serif" : "Times New Roman",
+    'text.usetex': True,
+    'pgf.rcfonts': False,
+})
+
+from sklearn.linear_model import LinearRegression
+
+from tqdm import tqdm
+
+def set_size(width, fraction=1, ratio = None):
+    fig_width_pt = width * fraction
+    inches_per_pt = 1 / 72.27
+    if ratio is None:
+        ratio = (5 ** 0.5 - 1) / 2
+    fig_width_in = fig_width_pt * inches_per_pt
+    fig_height_in = fig_width_in * ratio
+    return fig_width_in, fig_height_in
+
+
+df = pd.read_csv("input/vanuatu-paper/correlations.csv").set_index("speaker")
+df = df.to_dict(orient="index")
+print(df)
+
+parser = argparse.ArgumentParser(description = 'plot_sim_null')
+parser.add_argument('--data', nargs='+')
+parser.add_argument('--fit', nargs='+')
+parser.add_argument('--output')
+args = parser.parse_args()
+
+data = []
+
+for i, _data in enumerate(args.data):
+    with open(_data, 'rb') as fp:
+        data.append(pickle.load(fp))
+
+n_samples = 0
+
+for i, _fit in enumerate(args.fit):
+    table = pq.read_table(_fit, columns=[])
+    n_samples += table.num_rows
+
+
+speakers = ['CHI', 'OCH', 'FEM', 'MAL']
+
+#n_simulations = fit['n_sim'].iloc[0]
+n_sim = data[0]['n_sim']
+n_sim = 38
+
+sim_beta_fem = np.zeros(n_samples)
+sim_beta_mal = np.zeros(n_samples)
+sim_beta_och = np.zeros(n_samples)
+sim_r_fem = np.zeros(n_samples)
+sim_r_mal = np.zeros(n_samples)
+sim_r_och = np.zeros(n_samples)
+true = np.zeros(n_samples)
+
+
+pbar = tqdm(total=n_samples)
+
+seek = 0
+for i, _fit in enumerate(args.fit):
+    fit = pd.read_parquet(_fit)
+    fit = fit[fit['chi_adu_coef'] < 0.0001]
+
+    for j in range(len(fit)):
+        f = fit.iloc[j]
+
+        chi_vtc = np.array([f[f'sim_vtc.{k+1}.1'] for k in range(n_sim)])
+        fem_vtc = np.array([f[f'sim_vtc.{k+1}.3'] for k in range(n_sim)])
+        mal_vtc = np.array([f[f'sim_vtc.{k+1}.4'] for k in range(n_sim)])
+        och_vtc = np.array([f[f'sim_vtc.{k+1}.2'] for k in range(n_sim)])
+
+        regr = LinearRegression()
+        regr.fit(fem_vtc.reshape(-1, 1), chi_vtc)
+        sim_beta_fem[seek+j] = regr.coef_[0]
+        sim_r_fem[seek+j] = np.corrcoef(fem_vtc, chi_vtc)[0,1]
+
+        regr = LinearRegression()
+        regr.fit(mal_vtc.reshape(-1, 1), chi_vtc)
+        sim_beta_mal[seek+j] = regr.coef_[0]
+        sim_r_mal[seek+j] = np.corrcoef(mal_vtc, chi_vtc)[0,1]
+
+        regr = LinearRegression()
+        regr.fit(och_vtc.reshape(-1, 1), chi_vtc)
+        sim_beta_och[seek+j] = regr.coef_[0]
+        sim_r_och[seek+j] = np.corrcoef(och_vtc, chi_vtc)[0,1]
+
+        pbar.update(1)   
+
+    seek += len(fit)
+
+pbar.close()
+
+fig, ax = plt.subplots(1,2,figsize=set_size(450,1,0.5))
+ax[0].hist(sim_beta_fem, histtype = 'step', bins = 20, density = True, label = '$\\mathrm{CHI} = K \\mathrm{FEM} + I$')
+ax[0].hist(sim_beta_mal, histtype = 'step', bins = 20, density = True, label = '$\\mathrm{CHI} = K \\mathrm{MAL} + I$')
+ax[0].hist(sim_beta_och, histtype = 'step', bins = 20, density = True, label = '$\\mathrm{CHI} = K \\mathrm{OCH} + I$')
+ax[0].set_xlabel('$\\hat{K}$')
+ax[0].set_ylabel('$p(\\hat{K}|K=0)$')
+ax[0].legend(loc = 'upper left', bbox_to_anchor=[0, 1.25], frameon=False)
+ax[0].set_xlim(-1,1)
+
+ax[1].hist(sim_r_fem, histtype = 'step', bins = 20, density = True, label = '$R(\\mathrm{CHI},\\mathrm{FEM})$')
+ax[1].hist(sim_r_mal, histtype = 'step', bins = 20, density = True, label = '$R(\\mathrm{CHI},\\mathrm{MAL})$')
+ax[1].hist(sim_r_och, histtype = 'step', bins = 20, density = True, label = '$R(\\mathrm{CHI},\\mathrm{OCH})$')
+ax[1].set_xlabel('$R$')
+ax[1].set_ylabel('$p(R|K=0)$')
+ax[1].legend(loc = 'upper left', bbox_to_anchor=[0, 1.25], frameon=False)
+ax[1].set_xlim(-1,1)
+
+fig.savefig(args.output, bbox_inches = 'tight')
+
+df = pd.read_csv("input/vanuatu-paper/correlations.csv").set_index("speaker")
+df = df.to_dict(orient="index")
+
+from scipy import stats
+
+
+df["fem"]["p"] = 100-stats.percentileofscore(sim_r_fem, df["fem"]["R"])
+df["mal"]["p"] = 100-stats.percentileofscore(sim_r_mal, df["mal"]["R"])
+df["och"]["p"] = 100-stats.percentileofscore(sim_r_och, df["och"]["R"])
+
+print(len(sim_r_och[sim_r_och > df["och"]["R"]]))
+
+print(df)
+
+df = pd.DataFrame(df)
+print(df)
+
+df.to_csv("vanuatu_probs.csv")