Lucas Gautheron 1 year ago
parent commit b2325fb73e

+ 371 - 0
code/models/corpus_bias.py

@@ -0,0 +1,371 @@
+#!/usr/bin/env python3
+
+from ChildProject.projects import ChildProject
+from ChildProject.annotations import AnnotationManager
+from ChildProject.metrics import segments_to_annotation
+
+import argparse
+
+import datalad.api
+from os.path import join as opj
+from os.path import basename, exists
+
+import multiprocessing as mp
+
+import numpy as np
+import pandas as pd
+import pickle
+from pyannote.core import Annotation, Segment, Timeline
+
+import stan
+
+parser = argparse.ArgumentParser(
+    description="main model described throughout the notes."
+)
+# parser.add_argument("--group", default="child", choices=["corpus", "child"])
+parser.add_argument("--apply-bias-from", type=str, default="")
+parser.add_argument("--chains", default=4, type=int)
+parser.add_argument("--samples", default=2000, type=int)
+parser.add_argument("--validation", default=0, type=float)
+parser.add_argument("--output", default="corpus_bias")
+args = parser.parse_args()
+
+
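+# Remove from `timeline` every region overlapping `removed` (the complement of
+# crop); akin to pyannote's Timeline.extrude.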
+def extrude(timeline, removed, mode: str = "intersection"):
+    if isinstance(removed, Segment):
+        removed = Timeline([removed])
+
+    truncating_support = removed.gaps(support=timeline.extent())
+    # loose for truncate means strict for crop, and vice versa
+    if mode == "loose":
+        mode = "strict"
+    elif mode == "strict":
+        mode = "loose"
+
+    return timeline.crop(truncating_support, mode=mode)
+
+
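+# For one (corpus, annotator) pair, compute per-child counts of agreements and
+# confusions between the VTC automatic annotations and the human annotator,
+# which serves as ground truth.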
+def compute_counts(parameters):
+    corpus = parameters["corpus"]
+    annotator = parameters["annotator"]
+    speakers = ["CHI", "OCH", "FEM", "MAL"]
+
+    project = ChildProject(parameters["path"])
+    am = AnnotationManager(project)
+    am.read()
+
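+    # restrict the comparison to the portions of audio covered by both the
+    # VTC and the human annotator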
+    intersection = AnnotationManager.intersection(am.annotations, ["vtc", annotator])
+
+    intersection["path"] = intersection.apply(
+        lambda r: opj(
+            project.path, "annotations", r["set"], "converted", r["annotation_filename"]
+        ),
+        axis=1,
+    )
+    datalad.api.get(list(intersection["path"].unique()))
+
+    intersection = intersection.merge(
+        project.recordings[["recording_filename", "child_id"]], how="left"
+    )
+    intersection["child"] = corpus + "_" + intersection["child_id"].astype(str)
+    intersection["duration"] = (
+        intersection["range_offset"] - intersection["range_onset"]
+    )
+    print(corpus, annotator, (intersection["duration"] / 1000 / 2).sum() / 3600)
+
+    data = []
+    for child, ann in intersection.groupby("child"):
+        # print(corpus, child)
+
+        segments = am.get_collapsed_segments(ann)
+        if "speaker_type" not in segments.columns:
+            continue
+
+        segments = segments[segments["speaker_type"].isin(speakers)]
+
+        vtc = {
+            speaker: segments_to_annotation(
+                segments[
+                    (segments["set"] == "vtc") & (segments["speaker_type"] == speaker)
+                ],
+                "speaker_type",
+            ).get_timeline()
+            for speaker in speakers
+        }
+
+        truth = {
+            speaker: segments_to_annotation(
+                segments[
+                    (segments["set"] == annotator)
+                    & (segments["speaker_type"] == speaker)
+                ],
+                "speaker_type",
+            ).get_timeline()
+            for speaker in speakers
+        }
+
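+        # split each speaker's VTC segments into true positives ("explained" by
+        # the ground truth), false positives, and false negatives, then try to
+        # attribute each false positive to the speaker actually talking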
+        for speaker_A in speakers:
+            vtc[f"{speaker_A}_vocs_explained"] = vtc[speaker_A].crop(
+                truth[speaker_A], mode="loose"
+            )
+            vtc[f"{speaker_A}_vocs_fp"] = extrude(
+                vtc[speaker_A], vtc[f"{speaker_A}_vocs_explained"]
+            )
+            vtc[f"{speaker_A}_vocs_fn"] = extrude(
+                truth[speaker_A], truth[speaker_A].crop(vtc[speaker_A], mode="loose")
+            )
+
+            for speaker_B in speakers:
+                vtc[f"{speaker_A}_vocs_fp_{speaker_B}"] = vtc[
+                    f"{speaker_A}_vocs_fp"
+                ].crop(truth[speaker_B], mode="loose")
+
+                for speaker_C in speakers:
+                    if speaker_C != speaker_B and speaker_C != speaker_A:
+                        vtc[f"{speaker_A}_vocs_fp_{speaker_B}"] = extrude(
+                            vtc[f"{speaker_A}_vocs_fp_{speaker_B}"],
+                            vtc[f"{speaker_A}_vocs_fp_{speaker_B}"].crop(
+                                truth[speaker_C], mode="loose"
+                            ),
+                        )
+
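+        # vtc_{i}_{j} counts the speaker-j vocalizations classified as speaker i
+        # (on the diagonal: correct detections, capped by the truth count)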
+        d = {}
+        for i, speaker_A in enumerate(speakers):
+            for j, speaker_B in enumerate(speakers):
+                if i != j:
+                    z = len(vtc[f"{speaker_A}_vocs_fp_{speaker_B}"])
+                else:
+                    z = min(
+                        len(vtc[f"{speaker_A}_vocs_explained"]), len(truth[speaker_A])
+                    )
+
+                d[f"vtc_{i}_{j}"] = z
+
+            d[f"truth_{i}"] = len(truth[speaker_A])
+            d["child"] = child
+
+        d["duration"] = ann["duration"].sum() / 2 / 1000
+        data.append(d)
+
+    return pd.DataFrame(data).assign(
+        corpus=corpus,
+    )
+
+
+stan_code = """
+data {
+  int<lower=1> n_clips;   // number of clips
+  int<lower=1> n_groups; // number of groups
+  int<lower=1> n_corpora;
+  int<lower=1> n_classes; // number of classes
+  int group[n_clips];
+  int corpus[n_clips];
+  int vtc[n_clips,n_classes,n_classes]; // vtc[k,i,j]: true-j vocs detected as i
+  int truth[n_clips,n_classes]; // ground-truth vocalization counts per class
+
+  int<lower=1> n_validation; // clips with k < n_validation are held out
+  int<lower=1> n_sim; // number of simulated clips in generated quantities
+  int<lower=0> selected_corpus; // 0 = none (draw the corpus bias from its prior)
+
+  real<lower=0> rates_alphas[n_classes];
+  real<lower=0> rates_betas[n_classes];
+}
+
+parameters {
+  matrix<lower=0,upper=1>[n_classes,n_classes] mus;
+  matrix<lower=1>[n_classes,n_classes] etas;
+  matrix<lower=0,upper=1>[n_classes,n_classes] group_confusion[n_groups];
+  matrix[n_classes,n_classes] corpus_bias[n_corpora];
+
+  matrix<lower=0>[n_classes,n_classes] corpus_sigma;
+}
+
+transformed parameters {
+  matrix<lower=0>[n_classes,n_classes] alphas;
+  matrix<lower=0>[n_classes,n_classes] betas;
+
+  // elementwise: alphas[i,j] = mus[i,j] * etas[i,j] (mean/concentration
+  // parametrization of the beta distribution)
+  alphas = mus .* etas;
+  betas = (1 - mus) .* etas;
+}
+
+model {
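+    // likelihood over the training clips (clips with k < n_validation are
+    // held out for validation)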
+    for (k in n_validation:n_clips) {
+        for (i in 1:n_classes) {
+            for (j in 1:n_classes) {
+                vtc[k,i,j] ~ binomial(
+                    truth[k,j], inv_logit(logit(group_confusion[group[k],j,i]) + corpus_bias[corpus[k],j,i])
+                );
+            }
+        }
+    }
+
+    for (i in 1:n_classes) {
+        for (j in 1:n_classes) {
+            mus[i,j] ~ beta(1,1);
+            etas[i,j] ~ pareto(1,1.5);
+        }
+    }
+
+    for (c in 1:n_groups) {
+        for (i in 1:n_classes) {
+            for (j in 1:n_classes) {
+                group_confusion[c,i,j] ~ beta(alphas[i,j], betas[i,j]);
+            }
+        }
+    }
+
+    for (i in 1:n_classes) {
+        for (j in 1:n_classes) {
+            for (c in 1:n_corpora) {
+                corpus_bias[c,j,i] ~ normal(0, corpus_sigma[j,i]);
+            }
+            corpus_sigma[j,i] ~ normal(0, 1);
+        }
+    }    
+}
+
+generated quantities {
+    int pred[n_clips,n_classes,n_classes];
+    matrix[n_classes,n_classes] probs[n_groups];
+    matrix[n_classes,n_classes] log_lik[n_clips];
+
+    matrix[n_classes,n_classes] random_bias;
+    matrix[n_classes,n_classes] fixed_bias;
+
+    int sim_truth[n_sim,n_classes];
+    int sim_vtc[n_sim,n_classes];
+    vector[n_classes] lambdas;
+    real chi_adu_coef = 0; // null-hypothesis
+
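+    // corpus bias applied in the simulations below: either the bias inferred
+    // for the corpus selected with --apply-bias-from, or a draw from the prior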
+    for (i in 1:n_classes) {
+        for (j in 1:n_classes) {
+            if (selected_corpus != 0) {
+                fixed_bias[j, i] = corpus_bias[selected_corpus, j, i];
+            }
+            else {
+                fixed_bias[j, i] = 0;
+            }
+            random_bias[j,i] = normal_rng(0, corpus_sigma[j,i]);
+        }
+    }
+
+    for (c in 1:n_groups) {
+        for (i in 1:n_classes) {
+            for (j in 1:n_classes) {
+                probs[c,i,j] = beta_rng(alphas[i,j], betas[i,j]);
+            }
+        }
+    }
+
+    for (k in 1:n_clips) {
+        for (i in 1:n_classes) {
+            for (j in 1:n_classes) {
+                if (k >= n_validation) {
+                    pred[k,i,j] = binomial_rng(truth[k,j], inv_logit(logit(group_confusion[group[k],j,i]) + corpus_bias[corpus[k], j,i]));
+                    log_lik[k,i,j] = binomial_lpmf(
+                        vtc[k,i,j] | truth[k,j], inv_logit(logit(group_confusion[group[k],j,i]) + corpus_bias[corpus[k], j,i])
+                    );
+                }
+                else {
+                    pred[k,i,j] = binomial_rng(
+                        truth[k,j], inv_logit(logit(probs[group[k],j,i]) + corpus_bias[corpus[k], j,i])
+                    );
+                    log_lik[k,i,j] = beta_lpdf(probs[group[k],j,i] | alphas[j,i], betas[j,i]);
+                    log_lik[k,i,j] += binomial_lpmf(
+                        vtc[k,i,j] | truth[k,j], inv_logit(logit(probs[group[k],j,i]) + corpus_bias[corpus[k], j,i])
+                    );
+                }
+            }
+        }
+    }
+
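+    // simulate synthetic clips: draw true vocalization counts from the fitted
+    // rate distributions (the CHI rate optionally coupled to the adult counts
+    // through chi_adu_coef), then corrupt them through the confusion model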
+    real lambda;
+    for (k in 1:n_sim) {
+        for (i in 2:n_classes) {
+            lambda = gamma_rng(rates_alphas[i], rates_betas[i]);
+            sim_truth[k,i] = poisson_rng(lambda);
+        }
+        lambda = gamma_rng(rates_alphas[1], rates_betas[1]);
+        sim_truth[k,1] = poisson_rng(lambda + chi_adu_coef*(sim_truth[k,3]+sim_truth[k,4]));
+    }
+
+    for (k in 1:n_sim) {
+        for (i in 1:n_classes) {
+            sim_vtc[k,i] = 0;
+            for (j in 1:n_classes) {
+                real p = logit(beta_rng(alphas[j,i], betas[j,i]));
+
+                if (selected_corpus != 0) {
+                    p += fixed_bias[j,i];
+                }
+                else {
+                    p += random_bias[j,i];
+                }
+                p = inv_logit(p);
+                sim_vtc[k,i] += binomial_rng(sim_truth[k,j], p);
+            }
+        }
+    }
+}
+"""
+
+if __name__ == "__main__":
+    annotators = pd.read_csv("input/annotators.csv")
+    annotators["path"] = annotators["corpus"].apply(lambda c: opj("input", c))
+
+    with mp.Pool(processes=8) as pool:
+        data = pd.concat(pool.map(compute_counts, annotators.to_dict(orient="records")))
+
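+    # shuffle the clips so that the validation subset (the first clips, held
+    # out when --validation > 0) is a random sample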
+    data = data.sample(frac=1)
+    duration = data["duration"].sum()
+
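+    # vtc has shape (n_clips, n_classes, n_classes), with vtc[k,i,j] the number
+    # of clip-k vocalizations from speaker j classified as speaker i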
+    vtc = np.moveaxis(
+        [[data[f"vtc_{j}_{i}"].values for i in range(4)] for j in range(4)], -1, 0
+    )
+    truth = np.transpose([data[f"truth_{i}"].values for i in range(4)])
+
+    print(vtc.shape)
+
+    rates = pd.read_csv("output/speech_dist.csv")
+
+    data["corpus"] = data["corpus"].astype("category")
+    corpora = data["corpus"].cat.codes.values
+    corpora_codes = dict(enumerate(data["corpus"].cat.categories))
+    corpora_codes = {v: k for k, v in corpora_codes.items()}
+
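+    # Stan is 1-indexed, hence the 1 + ... offsets on the group and corpus codes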
+    data = {
+        "n_clips": truth.shape[0],
+        "n_classes": truth.shape[1],
+        "n_groups": data["child"].nunique(),
+        "n_corpora": data["corpus"].nunique(),
+        "n_validation": max(1, int(truth.shape[0] * args.validation)),
+        "n_sim": 40,
+        "group": 1 + data["child"].astype("category").cat.codes.values,
+        "corpus": 1 + corpora,
+        "selected_corpus": (
+            1 + corpora_codes[args.apply_bias_from]
+            if args.apply_bias_from in corpora_codes
+            else 0
+        ),
+        "truth": truth.astype(int),
+        "vtc": vtc.astype(int),
+        "rates_alphas": rates["alpha"].values,
+        "rates_betas": rates["beta"].values,
+    }
+
+    print(f"clips: {data['n_clips']}")
+    print(f"groups: {data['n_groups']}")
+    print("true vocs: {}".format(np.sum(data["truth"])))
+    print("vtc vocs: {}".format(np.sum(data["vtc"])))
+    print("duration: {}".format(duration))
+
+    print("selected corpus: {}".format(data["selected_corpus"]))
+
+    with open(f"output/samples/data_{args.output}.pickle", "wb") as fp:
+        pickle.dump(data, fp, pickle.HIGHEST_PROTOCOL)
+
+    posterior = stan.build(stan_code, data=data)
+    fit = posterior.sample(num_chains=args.chains, num_samples=args.samples)
+    df = fit.to_frame()
+    df.to_parquet(f"output/samples/fit_{args.output}.parquet")
+

+ 55 - 38
code/plots/confusion_probs.py

@@ -3,66 +3,83 @@
 import pandas as pd
 import pickle
 import numpy as np
+from scipy.special import logit, expit
 
 import argparse
 import matplotlib
 import matplotlib.pyplot as plt
+
 matplotlib.use("pgf")
-matplotlib.rcParams.update({
-    "pgf.texsystem": "pdflatex",
-    'font.family': 'serif',
-    "font.serif" : "Times New Roman",
-    'text.usetex': True,
-    'pgf.rcfonts': False,
-})
-
-def set_size(width, fraction=1, ratio = None):
+matplotlib.rcParams.update(
+    {
+        "pgf.texsystem": "pdflatex",
+        "font.family": "serif",
+        "font.serif": "Times New Roman",
+        "text.usetex": True,
+        "pgf.rcfonts": False,
+    }
+)
+
+
+def set_size(width, fraction=1, ratio=None):
     fig_width_pt = width * fraction
     inches_per_pt = 1 / 72.27
     if ratio is None:
-        ratio = (5 ** 0.5 - 1) / 2
+        ratio = (5**0.5 - 1) / 2
     fig_width_in = fig_width_pt * inches_per_pt
     fig_height_in = fig_width_in * ratio
     return fig_width_in, fig_height_in
 
-parser = argparse.ArgumentParser(description = 'plot_pred')
-parser.add_argument('data')
-parser.add_argument('fit')
-parser.add_argument('output')
+
+parser = argparse.ArgumentParser(description="plot_pred")
+parser.add_argument("data")
+parser.add_argument("fit")
+parser.add_argument("output")
 args = parser.parse_args()
 
-with open(args.data, 'rb') as fp:
+with open(args.data, "rb") as fp:
     data = pickle.load(fp)
 
 fit = pd.read_parquet(args.fit)
 
 fig = plt.figure(figsize=set_size(450, 1, 1))
-axes = [fig.add_subplot(4,4,i+1) for i in range(4*4)]
+axes = [fig.add_subplot(4, 4, i + 1) for i in range(4 * 4)]
 
-speakers = ['CHI', 'OCH', 'FEM', 'MAL']
+speakers = ["CHI", "OCH", "FEM", "MAL"]
 
-n_groups = data['n_groups']
+n_groups = data["n_groups"]
 
-for i in range(4*4):
+for i in range(4 * 4):
     ax = axes[i]
-    row = i//4+1
-    col = i%4+1
-    label = f'{col}.{row}'
+    row = i // 4 + 1
+    col = i % 4 + 1
+    label = f"{col}.{row}"
 
-    #if args.group is None:
+    # if args.group is None:
     #    data = np.hstack([fit[f'alphas.{k}.{label}']/(fit[f'alphas.{k}.{label}']+fit[f'betas.{k}.{label}']).values for k in range(1,n_groups+1)])
-    #else:
+    # else:
     #    data = fit[f'alphas.{args.group}.{label}']/(fit[f'alphas.{args.group}.{label}']+fit[f'betas.{args.group}.{label}']).values
-    #data = np.hstack([(fit[f'group_mus.{k}.{label}']).values for k in range(1,59)])
-    #data = fit[f'mus.{label}'].values
-    data = np.hstack([fit[f'probs.{k+1}.{label}'].values for k in range(n_groups)])
-  
+    # data = np.hstack([(fit[f'group_mus.{k}.{label}']).values for k in range(1,59)])
+    # data = fit[f'mus.{label}'].values
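+    # if the fit includes a corpus-level fixed bias (corpus_bias model), shift
+    # the confusion probabilities accordingly on the logit scale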
+    if "bias.1.1" in fit.columns:
+        data = expit(
+            np.hstack(
+                [
+                    logit(fit[f"probs.{k+1}.{label}"].values)
+                    + fit[f"fixed_bias.{label}"].values
+                    for k in range(n_groups)
+                ]
+            )
+        )
+    else:
+        data = np.hstack([fit[f"probs.{k+1}.{label}"].values for k in range(n_groups)])
+
     ax.set_xticks([])
     ax.set_xticklabels([])
     ax.set_yticks([])
     ax.set_yticklabels([])
-    ax.set_ylim(0,5)
-    ax.set_xlim(0,1)
+    ax.set_ylim(0, 5)
+    ax.set_xlim(0, 1)
 
     low = np.quantile(data, 0.0275)
     high = np.quantile(data, 0.975)
@@ -70,22 +87,22 @@ for i in range(4*4):
     if row == 1:
         ax.xaxis.tick_top()
         ax.set_xticks([0.5])
-        ax.set_xticklabels([speakers[col-1]])
+        ax.set_xticklabels([speakers[col - 1]])
 
     if row == 4:
-        ax.set_xticks(np.linspace(0.25,1,3, endpoint = False))
-        ax.set_xticklabels(np.linspace(0.25,1,3, endpoint = False))
+        ax.set_xticks(np.linspace(0.25, 1, 3, endpoint=False))
+        ax.set_xticklabels(np.linspace(0.25, 1, 3, endpoint=False))
 
     if col == 1:
         ax.set_yticks([2.5])
-        ax.set_yticklabels([speakers[row-1]])
+        ax.set_yticklabels([speakers[row - 1]])
 
-    ax.hist(data, bins = np.linspace(0,1,40), density = True, histtype = 'step')
-    ax.axvline(np.mean(data), linestyle = '--', linewidth = 0.5, color = '#333', alpha = 1)
-    ax.text(0.5, 4.5, f'{low:.2f} - {high:.2f}', ha = 'center', va = 'center')
+    ax.hist(data, bins=np.linspace(0, 1, 40), density=True, histtype="step")
+    ax.axvline(np.mean(data), linestyle="--", linewidth=0.5, color="#333", alpha=1)
+    ax.text(0.5, 4.5, f"{low:.2f} - {high:.2f}", ha="center", va="center")
 
 fig.suptitle("$p_{ij}$ distribution")
-fig.subplots_adjust(wspace = 0, hspace = 0)
+fig.subplots_adjust(wspace=0, hspace=0)
 
 plt.savefig(args.output)
 

+ 1 - 0
output/samples/data_corpus_bias.pickle

@@ -0,0 +1 @@
+../../.git/annex/objects/xW/34/MD5E-s23298--08ca5b5537789028f0486f69089ab5ef/MD5E-s23298--08ca5b5537789028f0486f69089ab5ef

+ 1 - 0
output/samples/fit_corpus_bias.parquet

@@ -0,0 +1 @@
+../../.git/annex/objects/1q/2q/MD5E-s226526446--0bd0c7f8f1085e21ace4f3fc38768967/MD5E-s226526446--0bd0c7f8f1085e21ace4f3fc38768967