123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231 |
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
"""
Created on Mon Jan 2 14:53:20 2023
@author: lpeurey

Manages the import of the EAF annotation campaign (eaf_2023 sets):
a custom converter so these annotations are imported properly.
"""
- import glob
- from pathlib import Path
- import numpy as np
- import pandas as pd
- import pympi
- from collections import defaultdict
- import os
- from ChildProject.projects import ChildProject
- from ChildProject.annotations import AnnotationManager
# Maps EAF speaker tier names to ChildProject speaker types.
# Any tier name not listed falls back to "NA" (defaultdict factory),
# so lookups never raise for unknown tiers.
SPEAKER_ID_TO_TYPE = defaultdict(
    lambda: "NA",
    {
        "CHI": "CHI",
        "FEM": "FEM",
        "MAL": "MAL",
        "OCH": "OCH",
    },
)

# Collapses the campaign's fine-grained vcm labels into coarse vcm_type codes
# (the raw label is kept separately in vcm_type_precise by convert()).
# NOTE(review): label semantics inferred from the mapping shape only —
# confirm against the annotation scheme documentation.
VCM_MAPPING = {
    'A':'N',
    'P':'N',
    'W':'C',
    'V':'C',
    'L':'L',
    'Y':'Y',
    'U':'U',
}

# Maps raw xds (addressee) labels to the stored addressee codes;
# 'B' expands to the comma-separated pair "A,C".
# NOTE(review): code meanings not shown in this file — verify with the scheme.
XDS_MAPPING = {
    'T':'T',
    'C':'C',
    'B':'A,C',
    'A':'A',
    'P':'P',
    'O':'O',
    'U':'U',
}

# Recording paths targeted by this import batch
# (referenced by the recording-based import variant further below).
BP_RECS = ['77033_5/77033_5.WAV']
def convert(filename: str, filter=None, **kwargs) -> pd.DataFrame:
    """Convert one EAF file of the annotation campaign into a ChildProject
    segments dataframe.

    Top-level speaker tiers (CHI/FEM/MAL/OCH) each produce one segment per
    annotation. Dependent tiers named ``label@speaker`` (vcm, msc, gra, xds)
    are walked up their PARENT_REF chain and merged into the top-level
    segment they ultimately annotate.

    :param filename: path to the .eaf file to parse
    :param filter: unused; kept for compatibility with the
        AnnotationManager ``import_function`` signature
    :return: one row per speaker segment, with vcm/msc/gra/xds columns
        left as "NA" when the dependent tier carries no value
    """
    eaf = pympi.Elan.Eaf(filename)

    segments = {}

    # First pass: one segment per annotation on the known speaker tiers.
    for tier_name in eaf.tiers:
        print(tier_name)
        annotations = eaf.tiers[tier_name][0]
        if (
            tier_name not in SPEAKER_ID_TO_TYPE
            and len(annotations) > 0
        ):
            print(
                "warning: unknown tier '{}' will be ignored in '{}'".format(
                    tier_name, filename
                )
            )
            continue

        for aid in annotations:
            (start_ts, end_ts, value, svg_ref) = annotations[aid]
            # resolve timeslot ids to times (milliseconds)
            (start_t, end_t) = (eaf.timeslots[start_ts], eaf.timeslots[end_ts])

            segments[aid] = {
                "segment_onset": int(round(start_t)),
                "segment_offset": int(round(end_t)),
                "speaker_id": tier_name,
                "speaker_type": SPEAKER_ID_TO_TYPE[tier_name],
                "vcm_type": "NA",
                "vcm_type_precise": "NA",
                "msc_type": "NA",
                "xds_type": "NA",
                "gra_type": "NA",
                "addressee": "NA",
            }

    # Second pass: dependent tiers are named "label@speaker"; attach their
    # values to the top-level segment they reference.
    for tier_name in eaf.tiers:
        if "@" in tier_name:
            label, ref = tier_name.split("@")
        else:
            label, ref = tier_name, None

        reference_annotations = eaf.tiers[tier_name][1]

        # only dependent tiers attached to a known speaker tier are relevant
        if ref not in SPEAKER_ID_TO_TYPE:
            continue

        for aid in reference_annotations:
            # only `value` is used; the reference id from the tuple is
            # ignored — the climb below starts from the annotation itself
            (_ref_ann, value, _prev, _svg) = reference_annotations[aid]

            # Climb the PARENT_REF chain up to the top-level annotation.
            # eaf.annotations maps an annotation id to its tier name;
            # tier[2] holds the tier attributes, tier[1] its ref annotations.
            # (The original extra `len(parentTier[2]) > 0` test was removed:
            # it is implied by the "PARENT_REF" membership test.)
            ann = aid
            parentTier = eaf.tiers[eaf.annotations[ann]]
            while "PARENT_REF" in parentTier[2] and parentTier[2]["PARENT_REF"]:
                ann = parentTier[1][ann][0]
                parentTier = eaf.tiers[eaf.annotations[ann]]

            if ann not in segments:
                print(
                    "warning: annotation '{}' not found in segments for '{}'".format(
                        ann, filename
                    )
                )
                continue

            segment = segments[ann]

            if value:  # discard segments that have no label (kept NA)
                if label == "vcm":
                    segment["vcm_type"] = VCM_MAPPING[value]
                    segment["vcm_type_precise"] = value  # keep the raw label too
                elif label == "msc":
                    segment["msc_type"] = value
                elif label == "gra":
                    segment["gra_type"] = value
                elif label == "xds":
                    segment["addressee"] = XDS_MAPPING[value]

    return pd.DataFrame(segments.values())
# Raw .eaf basenames to import in this run (filter applied in __main__ below).
BP_REC = ['77033_5.eaf']
# Minimum gap (milliseconds) separating two annotation chunks: gaps of at
# least this length split a file's annotations into separate imported ranges.
chunk_break = 300000
if __name__ == '__main__':

    project = ChildProject('.')
    am = AnnotationManager(project)

    # Candidate raw EAF files for this batch, restricted to BP_REC.
    files = pd.DataFrame([
        {'raw_filename': f}
        for f in glob.glob('./annotations/eaf_2023/ak/raw/*/*.eaf')
        if f.split('/')[-1] in BP_REC
    ])
    files['time_seek'] = 0
    files['raw_filename'] = files['raw_filename'].apply(os.path.basename)
    # "<stem>.eaf" annotates recording "<stem>/<stem>.WAV"
    files['recording_filename'] = files['raw_filename'].apply(
        lambda x: x.split('.')[-2] + '/' + x.split('.')[-2] + '.WAV'
    )
    files['set'] = 'eaf_2023/ak'
    files['format'] = 'eaf'
    print(files)

    final = None
    for f in files.to_dict(orient='records'):
        eaf = pympi.Elan.Eaf(
            Path('./annotations') / 'eaf_2023' / 'ak' / 'raw'
            / f['raw_filename'].split('.')[0] / f['raw_filename']
        )
        # NOTE(review): return value unused — presumably a leftover sanity
        # check; confirm before removing.
        eaf.get_full_time_interval()

        # Gather every annotated interval of every tier.
        # (Replaces the original per-annotation pd.concat, which rebuilt the
        # dataframe on every iteration — O(n^2); results are identical since
        # onsets and offsets were sorted independently anyway.)
        onsets = []
        offsets = []
        for tier in eaf.get_tier_names():
            for ann in eaf.get_annotation_data_for_tier(tier):
                onsets.append(ann[0])
                offsets.append(ann[1])

        onsets = np.sort(np.asarray(onsets))
        offsets = np.sort(np.asarray(offsets))

        # Chunk starts: the first onset, plus every onset that follows a
        # gap of at least chunk_break ms.
        idx_st = np.flatnonzero(np.diff(onsets) >= chunk_break) + 1
        start_times = np.insert(onsets[idx_st], 0, onsets[0])

        # Chunk ends: every offset that precedes such a gap, plus the last.
        idx_end = np.flatnonzero(np.diff(offsets) >= chunk_break)
        end_times = np.append(offsets[idx_end], offsets[-1])

        final = pd.DataFrame({
            'range_onset': start_times,
            'range_offset': end_times,
        })
        final['time_seek'] = 0
        # NOTE(review): filenames are hard-coded to the single BP_REC entry;
        # if BP_REC ever grows, derive these from f['raw_filename'] /
        # f['recording_filename'] instead — only the last file's chunks
        # survive this loop as written.
        final['raw_filename'] = '77033_5/77033_5.eaf'
        final['recording_filename'] = '77033_5/77033_5.WAV'
        final['format'] = 'eaf'
        final['set'] = 'eaf_2023/ak'

    import_df = final.reset_index(drop=True)
    print(import_df)

    am.import_annotations(import_df, threads=1, import_function=convert, overwrite_existing=True)
|