
model cleanup

Lucas Gautheron, 2 years ago
commit 5d1b14577b

+ 334 - 0
code/models/annotators_errors.py

@@ -0,0 +1,334 @@
+#!/usr/bin/env python3
+
+from ChildProject.projects import ChildProject
+from ChildProject.annotations import AnnotationManager
+from ChildProject.metrics import segments_to_annotation
+
+import argparse
+
+import datalad.api
+from os.path import join as opj
+from os.path import basename, exists
+from functools import partial
+from itertools import combinations
+import copy
+
+import multiprocessing as mp
+
+import numpy as np
+import pandas as pd
+import pickle
+from pyannote.core import Annotation, Segment, Timeline
+
+import stan
+
+parser = argparse.ArgumentParser(description="model3 (annotator errors variant)")
+parser.add_argument("--group", default="child", choices=["corpus", "child"])
+parser.add_argument("--chains", default=4, type=int)
+parser.add_argument("--samples", default=2000, type=int)
+parser.add_argument("--require-agreement", action='store_true')
+args = parser.parse_args()
+
+
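+# Return the parts of `self` (a pyannote Timeline) that do not overlap `removed`,
+# by cropping onto the gaps left by `removed`.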
+def extrude(self, removed, mode: str = "intersection"):
+    if isinstance(removed, Segment):
+        removed = Timeline([removed])
+
+    truncating_support = removed.gaps(support=self.extent())
+    # loose for truncate means strict for crop and vice-versa
+    if mode == "loose":
+        mode = "strict"
+    elif mode == "strict":
+        mode = "loose"
+
+    return self.crop(truncating_support, mode=mode)
+
+
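+# For one corpus, select the pair of human annotators whose annotation sets
+# overlap for the longest total duration (returns None if no two annotators overlap).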
+def largest_intersection(corpus):
+    corpus, annotators = corpus
+
+    project = ChildProject(annotators["path"].iloc[0])
+    am = AnnotationManager(project)
+    am.read()
+
+    largest_duration = 0
+    best_pair = None
+    for x, y in combinations(annotators["annotator"].values, r=2):
+        intersection = am.intersection(am.annotations, [x, y])
+        intersection["duration"] = intersection["range_offset"] - intersection["range_onset"]
+        duration = intersection["duration"].sum() / 2
+
+        if duration > largest_duration:
+            best_pair = (x, y)
+            largest_duration = duration
+
+    if best_pair is None:
+        return None
+    else:
+        return annotators[annotators['annotator'].isin(best_pair)]
+
+
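+# For one corpus, compare VTC to the two selected human annotators, child by child,
+# counting how many VTC detections of each speaker type are explained by each true speaker type.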
+def process_pair(pair):
+    corpus, parameters = pair
+    annotators = parameters["annotator"].values
+    speakers = ["CHI", "OCH", "FEM", "MAL"]
+
+    project = ChildProject(parameters["path"].iloc[0])
+    am = AnnotationManager(project)
+    am.read()
+
+    intersection = AnnotationManager.intersection(am.annotations, [annotators[0], annotators[1], 'vtc'])
+
+    intersection["path"] = intersection.apply(
+        lambda r: opj(
+            project.path, "annotations", r["set"], "converted", r["annotation_filename"]
+        ),
+        axis=1,
+    )
+    datalad.api.get(list(intersection["path"].unique()))
+
+    intersection = intersection.merge(
+        project.recordings[["recording_filename", "child_id"]], how="left"
+    )
+    intersection["child"] = corpus + "_" + intersection["child_id"].astype(str)
+
+    data = []
+    for child, ann in intersection.groupby("child"):
+        print(corpus, child)
+
+        segments = am.get_collapsed_segments(ann)
+        if "speaker_type" not in segments.columns:
+            continue
+
+        segments = segments[segments["speaker_type"].isin(speakers)]
+
+        vtc = {
+            speaker: segments_to_annotation(
+                segments[
+                    (segments["set"] == "vtc") & (segments["speaker_type"] == speaker)
+                ],
+                "speaker_type",
+            ).get_timeline()
+            for speaker in speakers
+        }
+
+        truth1 = {
+            speaker: segments_to_annotation(
+                segments[
+                    (segments["set"] == annotators[0])
+                    & (segments["speaker_type"] == speaker)
+                ],
+                "speaker_type",
+            ).get_timeline()
+            for speaker in speakers
+        }
+
+        truth2 = {
+            speaker: segments_to_annotation(
+                segments[
+                    (segments["set"] == annotators[1])
+                    & (segments["speaker_type"] == speaker)
+                ],
+                "speaker_type",
+            ).get_timeline()
+            for speaker in speakers
+        }
+
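+        # split VTC detections of each speaker into those explained by either
+        # annotator and the remaining false positives, then attribute each false
+        # positive to the speaker on which both annotators agree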
+        for speaker_A in speakers:
+            union = copy.copy(truth1[speaker_A])
+            union.update(truth2[speaker_A])
+
+            vtc[f"{speaker_A}_vocs_explained"] = vtc[speaker_A].crop(
+                union, mode="loose"
+            )
+
+            vtc[f"{speaker_A}_vocs_fp"] = extrude(
+                vtc[speaker_A], vtc[f"{speaker_A}_vocs_explained"]
+            )
+            vtc[f"{speaker_A}_vocs_fn"] = extrude(
+                union, union.crop(vtc[speaker_A], mode="loose")
+            )
+
+            for speaker_B in speakers:
+                intersect_B = truth1[speaker_B]
+                intersect_B = intersect_B.crop(truth2[speaker_B], mode="loose")
+
+                vtc[f"{speaker_A}_vocs_fp_{speaker_B}"] = vtc[
+                    f"{speaker_A}_vocs_fp"
+                ].crop(intersect_B, mode="loose")
+
+                for speaker_C in speakers:
+                    intersect_C = truth1[speaker_C]
+                    intersect_C = intersect_C.crop(truth2[speaker_C], mode="loose")
+                    if speaker_C != speaker_B and speaker_C != speaker_A:
+                        vtc[f"{speaker_A}_vocs_fp_{speaker_B}"] = extrude(
+                            vtc[f"{speaker_A}_vocs_fp_{speaker_B}"],
+                            vtc[f"{speaker_A}_vocs_fp_{speaker_B}"].crop(
+                                intersect_C, mode="loose"
+                            ),
+                        )
+
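+        # vtc_i_j: VTC detections of speaker i attributed to true speaker j
+        # (diagonal: detections explained by either annotator, capped at the agreed count);
+        # truth_i: vocalizations of speaker i annotated by both annotators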
+        d = {}
+        for i, speaker_A in enumerate(speakers):
+            truth_len = len(truth1[speaker_A].crop(truth2[speaker_A], mode="loose"))
+            for j, speaker_B in enumerate(speakers):
+                if i != j:
+                    z = len(vtc[f"{speaker_A}_vocs_fp_{speaker_B}"])
+                else:
+                    z = min(
+                        len(vtc[f"{speaker_A}_vocs_explained"]), truth_len
+                    )
+
+                d[f"vtc_{i}_{j}"] = z
+
+            d[f"truth_{i}"] = truth_len
+            d["child"] = child
+
+        data.append(d)
+
+    return pd.DataFrame(data).assign(corpus=corpus)
+
+
+stan_code = """
+data {
+  int<lower=1> n_clips;   // number of clips
+  int<lower=1> n_groups; // number of groups
+  int<lower=1> n_classes; // number of classes
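+  // group[k]: group (child or corpus) of clip k
+  // vtc[k,i,j]: detections of class i in clip k attributed to true class j
+  // truth[k,j]: count of class-j vocalizations on which both annotators agree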
+  int group[n_clips];
+  int vtc[n_clips,n_classes,n_classes];
+  int truth[n_clips,n_classes];
+
+  int<lower=1> n_sim;
+
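+  // Gamma hyperparameters of each class's vocalization rate (from output/speech_dist.csv)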
+  real<lower=0> rates_alphas[n_classes];
+  real<lower=0> rates_betas[n_classes];
+}
+
+parameters {
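+  // mus, etas: mean and concentration of the Beta prior on each confusion probability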
+  matrix<lower=0,upper=1>[n_classes,n_classes] mus;
+  matrix<lower=1>[n_classes,n_classes] etas;
+  matrix<lower=0,upper=1>[n_classes,n_classes] group_confusion[n_groups];
+}
+
+transformed parameters {
+  matrix<lower=0>[n_classes,n_classes] alphas;
+  matrix<lower=0>[n_classes,n_classes] betas;
+
+  alphas = mus .* etas;
+  betas = (1 - mus) .* etas;
+}
+
+model {
+    for (k in 1:n_clips) {
+        for (i in 1:n_classes) {
+            for (j in 1:n_classes) {
+                vtc[k,i,j] ~ binomial(truth[k,j], group_confusion[group[k],j,i]);
+            }
+        }
+    }
+
+    for (i in 1:n_classes) {
+        for (j in 1:n_classes) {
+            mus[i,j] ~ beta(1,1);
+            etas[i,j] ~ pareto(1,1.5);
+        }
+    }
+
+    for (c in 1:n_groups) {
+        for (i in 1:n_classes) {
+            for (j in 1:n_classes) {
+                group_confusion[c,i,j] ~ beta(alphas[i,j], betas[i,j]);
+            }
+        }
+    }
+}
+
+generated quantities {
+    matrix[n_classes,n_classes] probs[n_groups];
+    int sim_truth[n_sim,n_classes];
+    int sim_vtc[n_sim,n_classes];
+    vector[n_classes] lambdas;
+    real chi_adu_coef;
+
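+    // chi_adu_coef couples the key child's (CHI) simulated rate to the adult
+    // (FEM+MAL) counts; it is zero with probability 0.2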
+    if (uniform_rng(0,1) > 0.2) {
+        chi_adu_coef = uniform_rng(0,1);
+    }
+    else {
+        chi_adu_coef = 0;
+    }
+
+    for (c in 1:n_groups) {
+        for (i in 1:n_classes) {
+            for (j in 1:n_classes) {
+                probs[c,i,j] = beta_rng(alphas[i,j], betas[i,j]);
+            }
+        }
+    }
+
+    real lambda;
+    for (k in 1:n_sim) {
+        for (i in 2:n_classes) {
+            lambda = gamma_rng(rates_alphas[i], rates_betas[i]);
+            sim_truth[k,i] = poisson_rng(lambda);
+        }
+        lambda = gamma_rng(rates_alphas[1], rates_betas[1]);
+        sim_truth[k,1] = poisson_rng(lambda + chi_adu_coef*(sim_truth[k,3]+sim_truth[k,4]));
+    }
+
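+    // simulate VTC detections for a new group: draw confusion probabilities from
+    // the population-level Beta and apply them to the simulated true counts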
+    for (k in 1:n_sim) {
+        for (i in 1:n_classes) {
+            sim_vtc[k,i] = 0;
+            for (j in 1:n_classes) {
+                real p = beta_rng(alphas[i,j], betas[i,j]);
+                sim_vtc[k,i] += binomial_rng(sim_truth[k,j], p);
+            }
+        }
+    }
+}
+"""
+
+if __name__ == "__main__":
+    annotators = pd.read_csv("input/annotators.csv")
+    annotators["path"] = annotators["corpus"].apply(lambda c: opj("input", c))
+
+    with mp.Pool(processes=8) as pool:
+        pairs = pd.concat(pool.map(largest_intersection, annotators.groupby('corpus')))
+        print(pairs)
+        data = pd.concat(pool.map(process_pair, pairs.groupby('corpus')))
+
+    data = data.sample(frac=1)
+
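+    # vtc[k, i, j]: detections of class i attributed to true class j, for clip k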
+    vtc = np.moveaxis(
+        [[data[f"vtc_{j}_{i}"].values for i in range(4)] for j in range(4)], -1, 0
+    )
+    truth = np.transpose([data[f"truth_{i}"].values for i in range(4)])
+
+    print(vtc.shape)
+
+    rates = pd.read_csv("output/speech_dist.csv")
+
+    data = {
+        "n_clips": truth.shape[0],
+        "n_classes": truth.shape[1],
+        "n_groups": data[args.group].nunique(),
+        "n_sim": 100,
+        "group": 1 + data[args.group].astype("category").cat.codes.values,
+        "truth": truth.astype(int),
+        "vtc": vtc.astype(int),
+        "rates_alphas": rates["alpha"].values,
+        "rates_betas": rates["beta"].values,
+    }
+
+    print(f"clips: {data['n_clips']}")
+    print(f"groups: {data['n_groups']}")
+    print("true vocs: {}".format(np.sum(data["truth"])))
+    print("vtc vocs: {}".format(np.sum(data["vtc"])))
+
+    with open("data_model3_errors.pickle", "wb") as fp:
+        pickle.dump(data, fp, pickle.HIGHEST_PROTOCOL)
+
+    posterior = stan.build(stan_code, data=data)
+    fit = posterior.sample(num_chains=args.chains, num_samples=args.samples)
+    df = fit.to_frame()
+    df.to_parquet("fit_model3_errors.parquet")
+

+ 1 - 1
code/model3.py

@@ -19,7 +19,7 @@ from pyannote.core import Annotation, Segment, Timeline
 
 import stan
 
-parser = argparse.ArgumentParser(description = 'model3')
+parser = argparse.ArgumentParser(description = 'main model described throughout the notes.')
 parser.add_argument('--group', default = 'child', choices = ['corpus', 'child'])
 parser.add_argument('--chains', default = 4, type = int)
 parser.add_argument('--samples', default = 2000, type = int)

code/model1.py → code/models/model1.py


code/model2.py → code/models/model2.py


code/model4.py → code/models/model4.py


+ 5 - 5
code/model5.py

@@ -22,6 +22,7 @@ parser = argparse.ArgumentParser(description = 'model4')
 parser.add_argument('--group', default = 'child', choices = ['corpus', 'child'])
 parser.add_argument('--chains', default = 4, type = int)
 parser.add_argument('--samples', default = 2000, type = int)
+parser.add_argument('--validation', default = 0, type = float)
 args = parser.parse_args()
 
 def extrude(self, removed, mode: str = 'intersection'):
@@ -130,6 +131,7 @@ data {
   int group[n_clips];
   int vtc[n_clips,n_classes,n_classes];
   int truth[n_clips,n_classes];
+  int n_validation;
 }
 
 parameters {
@@ -167,7 +169,7 @@ transformed parameters {
 }
 
 model {
-    for (k in 1:n_clips) {
+    for (k in (n_validation+1):n_clips) {
         for (i in 1:n_classes) {
             for (j in 1:n_classes) {
                 vtc[k,i,j] ~ binomial(truth[k,j], group_confusion[group[k],j,i]);
@@ -214,10 +216,7 @@ generated quantities {
 
     for (k in 1:n_clips) {
         for (i in 1:n_classes) {
-            pred[k,i] = 0;
-            for (j in 1:n_classes) {
-                pred[k,i] += binomial_rng(truth[k,j], probs[group[k],i,j]); 
-            }
+            pred[k,i] = binomial_rng(truth[k,i], probs[group[k],i,i]);
         }
     }
 }
@@ -241,6 +240,7 @@ if __name__ == "__main__":
         'n_clips': truth.shape[0],
         'n_classes': truth.shape[1],
         'n_groups': data[args.group].nunique(),
+        'n_validation': int(truth.shape[0]*args.validation),
         'group': 1+data[args.group].astype('category').cat.codes.values,
         'truth': truth.astype(int),
         'vtc': vtc.astype(int),

+ 0 - 1
output/clips/vtc.txt

@@ -1 +0,0 @@
-../../.git/annex/objects/8k/q6/MD5E-s28257--2c9cfbddb53e6c9a11c87c82a0021054.txt/MD5E-s28257--2c9cfbddb53e6c9a11c87c82a0021054.txt

+ 1 - 0
output/clips/vtc.txt

@@ -0,0 +1 @@
+/annex/objects/MD5E-s28257--2c9cfbddb53e6c9a11c87c82a0021054.txt

+ 0 - 1
output/p.pdf

@@ -1 +0,0 @@
-../.git/annex/objects/MZ/KW/MD5E-s48532--9c9ce34f6f5fe7fe3a82dbc278ff4131.pdf/MD5E-s48532--9c9ce34f6f5fe7fe3a82dbc278ff4131.pdf

+ 1 - 0
output/p.pdf

@@ -0,0 +1 @@
+/annex/objects/MD5E-s48532--9c9ce34f6f5fe7fe3a82dbc278ff4131.pdf