|
@@ -0,0 +1,231 @@
|
|
|
+#!/usr/bin/env python3
|
|
|
+# -*- coding: utf-8 -*-
|
|
|
+"""
|
|
|
+Created on Mon Jan 2 14:53:20 2023
|
|
|
+
|
|
|
+@author: lpeurey
|
|
|
+
|
|
|
+Manage the importation of eaf 2022 annotation campaign
|
|
|
+custom converter to import properly
|
|
|
+"""
|
|
|
+import glob
|
|
|
+from pathlib import Path
|
|
|
+
|
|
|
+import numpy as np
|
|
|
+import pandas as pd
|
|
|
+import pympi
|
|
|
+from collections import defaultdict
|
|
|
+import os
|
|
|
+
|
|
|
+from ChildProject.projects import ChildProject
|
|
|
+from ChildProject.annotations import AnnotationManager
|
|
|
+
|
|
|
# Speaker tiers handled by ``convert``: each known tier name maps to itself,
# and looking up any other name yields "NA" through the default factory.
# A membership test (``name in SPEAKER_ID_TO_TYPE``) does not trigger the
# default, so it can still be used to detect unknown tiers.
SPEAKER_ID_TO_TYPE = defaultdict(
    lambda: "NA",
    {speaker: speaker for speaker in ("CHI", "FEM", "MAL", "OCH")},
)

# Annotated ``vcm`` code -> value stored in the ``vcm_type`` column
# (the raw code itself is kept in ``vcm_type_precise`` by ``convert``).
VCM_MAPPING = dict(A='N', P='N', W='C', V='C', L='L', Y='Y', U='U')

# Annotated ``xds`` code -> value stored in the ``addressee`` column.
XDS_MAPPING = dict(T='T', C='C', B='A,C', A='A', P='P', O='O', U='U')

# Recording filename(s) associated with this import.
BP_RECS = ['77033_5/77033_5.WAV']
|
|
|
def convert(filename: str, filter=None, **kwargs) -> pd.DataFrame:
    """Convert one ELAN ``.eaf`` file into a table of segments.

    The conversion runs in two passes over the tiers of the file:

    1. every aligned annotation of a known speaker tier (a key of
       ``SPEAKER_ID_TO_TYPE``) becomes one segment, with all label columns
       initialised to ``"NA"``;
    2. every dependent tier named ``<label>@<speaker>`` is walked, each of
       its annotations is traced back to the speaker-tier annotation it
       ultimately refers to, and the matching segment receives the label
       (``vcm``, ``msc``, ``gra`` or ``xds``).

    Anomalies (unknown non-empty tier, dependent annotation whose root is
    not a known segment, ``vcm``/``xds`` code missing from the mapping
    tables) are reported with a printed warning and skipped, so that a
    single odd annotation does not abort the import of the whole file.

    :param filename: path of the ``.eaf`` file to read
    :param filter: unused; accepted for compatibility with the caller
    :param kwargs: unused; accepted for compatibility with the caller
    :return: one row per segment with the columns ``segment_onset``,
        ``segment_offset``, ``speaker_id``, ``speaker_type``, ``vcm_type``,
        ``vcm_type_precise``, ``msc_type``, ``xds_type``, ``gra_type`` and
        ``addressee``; an empty frame when the file has no known-speaker
        annotation
    """
    eaf = pympi.Elan.Eaf(filename)

    # First pass: one segment per aligned annotation of a known speaker tier,
    # indexed by annotation id so dependent tiers can find it later.
    segments = {}
    for tier_name in eaf.tiers:
        print(tier_name)
        annotations = eaf.tiers[tier_name][0]
        if tier_name not in SPEAKER_ID_TO_TYPE and annotations:
            print(
                "warning: unknown tier '{}' will be ignored in '{}'".format(
                    tier_name, filename
                )
            )
            continue

        for aid in annotations:
            start_ts, end_ts = annotations[aid][0], annotations[aid][1]
            start_t, end_t = eaf.timeslots[start_ts], eaf.timeslots[end_ts]

            segments[aid] = {
                "segment_onset": int(round(start_t)),
                "segment_offset": int(round(end_t)),
                "speaker_id": tier_name,
                "speaker_type": SPEAKER_ID_TO_TYPE[tier_name],
                "vcm_type": "NA",
                "vcm_type_precise": "NA",
                "msc_type": "NA",
                "xds_type": "NA",
                "gra_type": "NA",
                "addressee": "NA",
            }

    # Second pass: dependent tiers named "<label>@<speaker>".
    for tier_name in eaf.tiers:
        # Split on the first "@" only: a name holding several "@" must not
        # crash the conversion, it simply will not match a known speaker.
        label, _, ref = tier_name.partition("@")
        if ref not in SPEAKER_ID_TO_TYPE:
            continue

        reference_annotations = eaf.tiers[tier_name][1]
        for aid in reference_annotations:
            value = reference_annotations[aid][1]

            root = _root_annotation(eaf, aid)
            if root not in segments:
                print(
                    "warning: annotation '{}' not found in segments for '{}'".format(
                        root, filename
                    )
                )
                continue

            if not value:
                # Annotations without a label leave the segment at "NA".
                continue

            _apply_label(segments[root], label, value, aid, filename)

    return pd.DataFrame(segments.values())


def _root_annotation(eaf, annotation_id):
    """Follow parent references up to the annotation of a top-level tier."""
    current = annotation_id
    tier = eaf.tiers[eaf.annotations[current]]
    # A tier whose attributes carry a non-empty PARENT_REF is a dependent
    # tier: its reference annotations store their parent id in position 0.
    while tier[2].get("PARENT_REF"):
        current = tier[1][current][0]
        tier = eaf.tiers[eaf.annotations[current]]
    return current


def _apply_label(segment, label, value, annotation_id, filename):
    """Store one dependent-tier label in ``segment``; warn on unknown codes."""
    if label == "vcm":
        mapping, target = VCM_MAPPING, "vcm_type"
    elif label == "xds":
        mapping, target = XDS_MAPPING, "addressee"
    elif label == "msc":
        segment["msc_type"] = value
        return
    elif label == "gra":
        segment["gra_type"] = value
        return
    else:
        return

    if value not in mapping:
        # Leave the affected columns at "NA" instead of raising KeyError,
        # and report the offending code so it can be fixed in the EAF file.
        print(
            "warning: unknown {} value '{}' for annotation '{}' in '{}'".format(
                label, value, annotation_id, filename
            )
        )
        return

    segment[target] = mapping[value]
    if label == "vcm":
        segment["vcm_type_precise"] = value
|
|
|
# Base names of the EAF files this script imports.
BP_REC = ['77033_5.eaf']

# Approximate duration, in milliseconds, of the shortest break between two
# annotation chunks: two consecutive onsets at least this far apart are
# considered to belong to different chunks.
chunk_break = 300000

# Location of the raw EAF files and name of the annotation set they belong to.
RAW_DIR = Path('./annotations') / 'eaf_2023' / 'ak' / 'raw'
ANNOTATION_SET = 'eaf_2023/ak'


def detect_chunks(intervals, min_break=chunk_break):
    """Group annotation intervals into annotated chunks.

    Intervals are sorted by onset; a new chunk starts whenever the onset of
    an interval is at least ``min_break`` later than the onset of the
    previous one. Each chunk spans from its first onset to the largest
    offset it contains, so onsets and offsets are always paired
    consistently, even for overlapping or very long annotations.

    :param intervals: iterable of ``(onset, offset)`` pairs
    :param min_break: minimal onset-to-onset distance separating two chunks
    :return: list of ``(range_onset, range_offset)`` tuples ordered by onset;
        empty when ``intervals`` is empty
    """
    chunks = []
    previous_onset = None
    for onset, offset in sorted(intervals):
        if previous_onset is None or onset - previous_onset >= min_break:
            chunks.append([onset, offset])
        elif offset > chunks[-1][1]:
            chunks[-1][1] = offset
        previous_onset = onset
    return [tuple(chunk) for chunk in chunks]


def main() -> None:
    """Import the annotated chunks of every EAF file listed in ``BP_REC``."""
    project = ChildProject('.')
    am = AnnotationManager(project)

    eaf_paths = sorted(
        path
        for path in glob.glob(str(RAW_DIR / '*' / '*.eaf'))
        if os.path.basename(path) in BP_REC
    )
    if not eaf_paths:
        raise SystemExit(
            "no EAF file listed in BP_REC was found under '{}'".format(RAW_DIR)
        )
    print(eaf_paths)

    tables = []
    for path in eaf_paths:
        eaf = pympi.Elan.Eaf(path)

        # Collect (onset, offset) of every annotation of every tier.
        intervals = [
            (ann[0], ann[1])
            for tier in eaf.get_tier_names()
            for ann in eaf.get_annotation_data_for_tier(tier)
        ]
        chunks = detect_chunks(intervals, chunk_break)
        if not chunks:
            print("warning: no annotation found in '{}', skipping".format(path))
            continue

        stem = Path(path).stem
        table = pd.DataFrame(chunks, columns=['range_onset', 'range_offset'])
        table['time_seek'] = 0
        # Raw files are referenced relative to the raw directory of the set,
        # e.g. '77033_5/77033_5.eaf'.
        table['raw_filename'] = Path(path).relative_to(RAW_DIR).as_posix()
        table['recording_filename'] = '{0}/{0}.WAV'.format(stem)
        table['format'] = 'eaf'
        table['set'] = ANNOTATION_SET
        tables.append(table)

    if not tables:
        raise SystemExit("none of the selected EAF files contains annotations")

    import_df = pd.concat(tables, ignore_index=True)
    print(import_df)

    am.import_annotations(import_df, threads=1, import_function=convert, overwrite_existing=True)


if __name__ == '__main__':
    main()
|
|
|
+
|