AgataKoziol 6 месяцев назад
Родитель
Commit
da839ee3a4
37 измененных файлов с 447 добавлено и 0 удалено
  1. 1 0
      .DS_Store
  2. 1 0
      annotations/.DS_Store
  3. 1 0
      annotations/eaf_2023/.DS_Store
  4. 1 0
      annotations/eaf_2023/ak/.DS_Store
  5. 1 0
      annotations/eaf_2023/ak/converted/77033_5/77033_5_10614464_10682016.csv
  6. 1 0
      annotations/eaf_2023/ak/converted/77033_5/77033_5_14928930_14991430.csv
  7. 1 0
      annotations/eaf_2023/ak/converted/77033_5/77033_5_17240844_17312171.csv
  8. 1 0
      annotations/eaf_2023/ak/converted/77033_5/77033_5_1804637_1918913.csv
  9. 1 0
      annotations/eaf_2023/ak/converted/77033_5/77033_5_18623929_18668843.csv
  10. 1 0
      annotations/eaf_2023/ak/converted/77033_5/77033_5_19080362_19095206.csv
  11. 1 0
      annotations/eaf_2023/ak/converted/77033_5/77033_5_310_72488.csv
  12. 1 0
      annotations/eaf_2023/ak/converted/77033_5/77033_5_3378792_3441447.csv
  13. 1 0
      annotations/eaf_2023/ak/converted/77033_5/77033_5_7034603_7098981.csv
  14. 1 0
      annotations/eaf_2023/ak/converted/77033_5/77033_5_8343248_8406016.csv
  15. 1 0
      annotations/eaf_2023/ak/converted/77033_5/77033_5_899827_962068.csv
  16. 1 0
      annotations/eaf_2023/ak/raw/.DS_Store
  17. 1 0
      annotations/eaf_2023/ak/raw/77021_5/V20230127-070014.eaf
  18. 1 0
      annotations/eaf_2023/ak/raw/77021_5/V20230127-070014.pfsx
  19. 1 0
      annotations/eaf_2023/ak/raw/77033_5/.DS_Store
  20. 1 0
      annotations/eaf_2023/ak/raw/77033_5/77033_5.eaf
  21. 1 0
      annotations/eaf_2023/ak/raw/77033_5/77033_5.pfsx
  22. 1 0
      annotations/eaf_2023/polish_template.etf
  23. 12 0
      metadata/annotations.csv
  24. 2 0
      metadata/children.csv
  25. 2 0
      metadata/recordings.csv
  26. 1 0
      recordings/.DS_Store
  27. 1 0
      recordings/converted/.DS_Store
  28. 1 0
      recordings/converted/standard/.DS_Store
  29. 1 0
      recordings/converted/standard/77033_5/77033_5.wav
  30. 1 0
      recordings/converted/standard/parameters_20231022_154333.yml
  31. 1 0
      recordings/converted/standard/recordings.csv
  32. 1 0
      recordings/raw/.DS_Store
  33. 1 0
      recordings/raw/77033_5/77033_5.wav
  34. 44 0
      scripts/calculate_annotation_chunks.py
  35. 88 0
      scripts/confusion_matrix2.py
  36. 38 0
      scripts/import.py
  37. 231 0
      scripts/import_eaf_poland.py

+ 1 - 0
.DS_Store

@@ -0,0 +1 @@
+.git/annex/objects/w1/8z/MD5E-s8196--e4ad73ec75c7e39a8524034b8edb17cf/MD5E-s8196--e4ad73ec75c7e39a8524034b8edb17cf

+ 1 - 0
annotations/.DS_Store

@@ -0,0 +1 @@
+../.git/annex/objects/ZK/kz/MD5E-s6148--fbb42a91672c814bafa77c819961d737/MD5E-s6148--fbb42a91672c814bafa77c819961d737

+ 1 - 0
annotations/eaf_2023/.DS_Store

@@ -0,0 +1 @@
+../../.git/annex/objects/xQ/09/MD5E-s6148--2edf891008094080170efe3e57150f96/MD5E-s6148--2edf891008094080170efe3e57150f96

+ 1 - 0
annotations/eaf_2023/ak/.DS_Store

@@ -0,0 +1 @@
+../../../.git/annex/objects/vw/7M/MD5E-s6148--8b44f2d28f2a8a70bf265ce6a261ca0d/MD5E-s6148--8b44f2d28f2a8a70bf265ce6a261ca0d

+ 1 - 0
annotations/eaf_2023/ak/converted/77033_5/77033_5_10614464_10682016.csv

@@ -0,0 +1 @@
+../../../../../.git/annex/objects/34/fp/MD5E-s2049--e079d0128832ef5873d6cd05ec920d20.csv/MD5E-s2049--e079d0128832ef5873d6cd05ec920d20.csv

+ 1 - 0
annotations/eaf_2023/ak/converted/77033_5/77033_5_14928930_14991430.csv

@@ -0,0 +1 @@
+../../../../../.git/annex/objects/M1/gG/MD5E-s2689--22e7ba5ca977585a43773123ea189531.csv/MD5E-s2689--22e7ba5ca977585a43773123ea189531.csv

+ 1 - 0
annotations/eaf_2023/ak/converted/77033_5/77033_5_17240844_17312171.csv

@@ -0,0 +1 @@
+../../../../../.git/annex/objects/gJ/7Q/MD5E-s2241--fb605b52094c03679a225991b097a3e3.csv/MD5E-s2241--fb605b52094c03679a225991b097a3e3.csv

+ 1 - 0
annotations/eaf_2023/ak/converted/77033_5/77033_5_1804637_1918913.csv

@@ -0,0 +1 @@
+../../../../../.git/annex/objects/7G/Pf/MD5E-s4779--57a54837fc9a4437912122a8f8e88c35.csv/MD5E-s4779--57a54837fc9a4437912122a8f8e88c35.csv

+ 1 - 0
annotations/eaf_2023/ak/converted/77033_5/77033_5_18623929_18668843.csv

@@ -0,0 +1 @@
+../../../../../.git/annex/objects/8Z/kp/MD5E-s2177--615c45205e56fea6994f1baeccb63723.csv/MD5E-s2177--615c45205e56fea6994f1baeccb63723.csv

+ 1 - 0
annotations/eaf_2023/ak/converted/77033_5/77033_5_19080362_19095206.csv

@@ -0,0 +1 @@
+../../../../../.git/annex/objects/Kj/9z/MD5E-s577--f9d916bc51322fb0a114dcae246a96ae.csv/MD5E-s577--f9d916bc51322fb0a114dcae246a96ae.csv

+ 1 - 0
annotations/eaf_2023/ak/converted/77033_5/77033_5_310_72488.csv

@@ -0,0 +1 @@
+../../../../../.git/annex/objects/Kz/X2/MD5E-s2609--72a3f8b82e8895c885e86443f7fef461.csv/MD5E-s2609--72a3f8b82e8895c885e86443f7fef461.csv

+ 1 - 0
annotations/eaf_2023/ak/converted/77033_5/77033_5_3378792_3441447.csv

@@ -0,0 +1 @@
+../../../../../.git/annex/objects/3f/jk/MD5E-s2361--e41109442e447563f6d49fd11a1f7936.csv/MD5E-s2361--e41109442e447563f6d49fd11a1f7936.csv

+ 1 - 0
annotations/eaf_2023/ak/converted/77033_5/77033_5_7034603_7098981.csv

@@ -0,0 +1 @@
+../../../../../.git/annex/objects/Xf/0Q/MD5E-s1245--52c5e720c7eb9737c63e413556378d44.csv/MD5E-s1245--52c5e720c7eb9737c63e413556378d44.csv

+ 1 - 0
annotations/eaf_2023/ak/converted/77033_5/77033_5_8343248_8406016.csv

@@ -0,0 +1 @@
+../../../../../.git/annex/objects/2k/V4/MD5E-s2113--710ed966074c091673932ab4a9f037f0.csv/MD5E-s2113--710ed966074c091673932ab4a9f037f0.csv

+ 1 - 0
annotations/eaf_2023/ak/converted/77033_5/77033_5_899827_962068.csv

@@ -0,0 +1 @@
+../../../../../.git/annex/objects/13/Z7/MD5E-s1869--2a8e2b42ced0aa84aabd31ec8ed78297.csv/MD5E-s1869--2a8e2b42ced0aa84aabd31ec8ed78297.csv

+ 1 - 0
annotations/eaf_2023/ak/raw/.DS_Store

@@ -0,0 +1 @@
+../../../../.git/annex/objects/FG/5W/MD5E-s6148--e6d5af7e7f055c2713261621be7e5055/MD5E-s6148--e6d5af7e7f055c2713261621be7e5055

+ 1 - 0
annotations/eaf_2023/ak/raw/77021_5/V20230127-070014.eaf

@@ -0,0 +1 @@
+../../../../../.git/annex/objects/pv/X4/MD5E-s31408--fb21ed4acd689a0eff81fc7a4d085c34.eaf/MD5E-s31408--fb21ed4acd689a0eff81fc7a4d085c34.eaf

+ 1 - 0
annotations/eaf_2023/ak/raw/77021_5/V20230127-070014.pfsx

@@ -0,0 +1 @@
+../../../../../.git/annex/objects/6j/gv/MD5E-s4230--d38a3757f90e876076605307f6b929c2.pfsx/MD5E-s4230--d38a3757f90e876076605307f6b929c2.pfsx

+ 1 - 0
annotations/eaf_2023/ak/raw/77033_5/.DS_Store

@@ -0,0 +1 @@
+../../../../../.git/annex/objects/3m/P7/MD5E-s6148--6531a98d751bcbce45b10ffed6347fa8/MD5E-s6148--6531a98d751bcbce45b10ffed6347fa8

+ 1 - 0
annotations/eaf_2023/ak/raw/77033_5/77033_5.eaf

@@ -0,0 +1 @@
+../../../../../.git/annex/objects/xv/4G/MD5E-s142563--5fc93d341aa2d40b2c36396348bf7988.eaf/MD5E-s142563--5fc93d341aa2d40b2c36396348bf7988.eaf

+ 1 - 0
annotations/eaf_2023/ak/raw/77033_5/77033_5.pfsx

@@ -0,0 +1 @@
+../../../../../.git/annex/objects/88/02/MD5E-s4133--f96a429b914b12bb11b61067c90cc1da.pfsx/MD5E-s4133--f96a429b914b12bb11b61067c90cc1da.pfsx

+ 1 - 0
annotations/eaf_2023/polish_template.etf

@@ -0,0 +1 @@
+../../.git/annex/objects/P5/63/MD5E-s3546--c5001de7a055c9a9c8ad2ae1f08aee1e.etf/MD5E-s3546--c5001de7a055c9a9c8ad2ae1f08aee1e.etf

+ 12 - 0
metadata/annotations.csv

@@ -0,0 +1,12 @@
+set,recording_filename,time_seek,range_onset,range_offset,raw_filename,format,filter,annotation_filename,imported_at,package_version,error,merged_from
+eaf_2023/ak,77033_5/77033_5.WAV,0,10614464,10682016,77033_5/77033_5.eaf,eaf,,77033_5/77033_5_10614464_10682016.csv,2023-10-22 15:45:26,0.1.1,,
+eaf_2023/ak,77033_5/77033_5.WAV,0,14928930,14991430,77033_5/77033_5.eaf,eaf,,77033_5/77033_5_14928930_14991430.csv,2023-10-22 15:45:26,0.1.1,,
+eaf_2023/ak,77033_5/77033_5.WAV,0,17240844,17312171,77033_5/77033_5.eaf,eaf,,77033_5/77033_5_17240844_17312171.csv,2023-10-22 15:45:26,0.1.1,,
+eaf_2023/ak,77033_5/77033_5.WAV,0,1804637,1918913,77033_5/77033_5.eaf,eaf,,77033_5/77033_5_1804637_1918913.csv,2023-10-22 15:45:26,0.1.1,,
+eaf_2023/ak,77033_5/77033_5.WAV,0,18623929,18668843,77033_5/77033_5.eaf,eaf,,77033_5/77033_5_18623929_18668843.csv,2023-10-22 15:45:26,0.1.1,,
+eaf_2023/ak,77033_5/77033_5.WAV,0,19080362,19095206,77033_5/77033_5.eaf,eaf,,77033_5/77033_5_19080362_19095206.csv,2023-10-22 15:45:26,0.1.1,,
+eaf_2023/ak,77033_5/77033_5.WAV,0,310,72488,77033_5/77033_5.eaf,eaf,,77033_5/77033_5_310_72488.csv,2023-10-22 15:45:26,0.1.1,,
+eaf_2023/ak,77033_5/77033_5.WAV,0,3378792,3441447,77033_5/77033_5.eaf,eaf,,77033_5/77033_5_3378792_3441447.csv,2023-10-22 15:45:26,0.1.1,,
+eaf_2023/ak,77033_5/77033_5.WAV,0,7034603,7098981,77033_5/77033_5.eaf,eaf,,77033_5/77033_5_7034603_7098981.csv,2023-10-22 15:45:26,0.1.1,,
+eaf_2023/ak,77033_5/77033_5.WAV,0,8343248,8406016,77033_5/77033_5.eaf,eaf,,77033_5/77033_5_8343248_8406016.csv,2023-10-22 15:45:26,0.1.1,,
+eaf_2023/ak,77033_5/77033_5.WAV,0,899827,962068,77033_5/77033_5.eaf,eaf,,77033_5/77033_5_899827_962068.csv,2023-10-22 15:45:26,0.1.1,,

+ 2 - 0
metadata/children.csv

@@ -0,0 +1,2 @@
+child_id,experiment,child_dob,location_id,child_sex,language,monoling,languages,mat_ed,fat_ed,monoling_criterion,n_of_siblings,household_size,dob_criterion,dob_accuracy,discard
+77033_5,MOVIN,2021-02-08,warsaw,m,polish,Y,,17,17,“we asked families which languages they spoke in the home”,0,3,exact,day,0

+ 2 - 0
metadata/recordings.csv

@@ -0,0 +1,2 @@
+experiment,child_id,date_iso,start_time,start_time_accuracy,recording_device_type,recording_filename,location_id,duration
+MOVIN,77033_5,2023-04-14,NA,hour,usb,77033_5/77033_5.WAV,warsaw,19100960

+ 1 - 0
recordings/.DS_Store

@@ -0,0 +1 @@
+../.git/annex/objects/1J/83/MD5E-s6148--41244a732097548580484001c3dd5fa2/MD5E-s6148--41244a732097548580484001c3dd5fa2

+ 1 - 0
recordings/converted/.DS_Store

@@ -0,0 +1 @@
+../../.git/annex/objects/0V/Zp/MD5E-s6148--689f59669593f9c6162bbe3e14168f7e/MD5E-s6148--689f59669593f9c6162bbe3e14168f7e

+ 1 - 0
recordings/converted/standard/.DS_Store

@@ -0,0 +1 @@
+../../../.git/annex/objects/85/3v/MD5E-s6148--f04bd34aa70c217c85aedec2e23603a0/MD5E-s6148--f04bd34aa70c217c85aedec2e23603a0

+ 1 - 0
recordings/converted/standard/77033_5/77033_5.wav

@@ -0,0 +1 @@
+../../../../.git/annex/objects/Z6/kM/MD5E-s1222461518--61eaea172b360dd9cbe00ab30c098738.wav/MD5E-s1222461518--61eaea172b360dd9cbe00ab30c098738.wav

+ 1 - 0
recordings/converted/standard/parameters_20231022_154333.yml

@@ -0,0 +1 @@
+../../../.git/annex/objects/0k/q8/MD5E-s283--3f43e34b993dfc2955b4aec6317c980f.yml/MD5E-s283--3f43e34b993dfc2955b4aec6317c980f.yml

+ 1 - 0
recordings/converted/standard/recordings.csv

@@ -0,0 +1 @@
+../../../.git/annex/objects/fG/W1/MD5E-s266--5b78a6f73f034e09ad02ba52f0b31fdf.csv/MD5E-s266--5b78a6f73f034e09ad02ba52f0b31fdf.csv

+ 1 - 0
recordings/raw/.DS_Store

@@ -0,0 +1 @@
+../../.git/annex/objects/ZG/g1/MD5E-s6148--3657552f3b819a7a1ab13b2bcebc843b/MD5E-s6148--3657552f3b819a7a1ab13b2bcebc843b

+ 1 - 0
recordings/raw/77033_5/77033_5.wav

@@ -0,0 +1 @@
+../../../.git/annex/objects/j2/6G/MD5E-s5501080576--495c039cbf09501b1178d68d48d1cd60.wav/MD5E-s5501080576--495c039cbf09501b1178d68d48d1cd60.wav

+ 44 - 0
scripts/calculate_annotation_chunks.py

@@ -0,0 +1,44 @@
"""Split each .eaf annotation file into annotation chunks.

Walks parent_directory recursively, and for every ELAN (.eaf) file collects
the onsets/offsets of all annotations across all tiers. Consecutive
annotations separated by at least chunk_break milliseconds are treated as
belonging to different chunks; the resulting chunk boundaries are written to
annotation_chunks/<file_name>/<file_name>.csv.
"""
import glob
import os
import numpy as np
import pandas as pd
import pympi

# Root directory containing the raw .eaf annotation files (searched recursively).
parent_directory = r"/Users/agatka/Library/CloudStorage/GoogleDrive-akoziol@sd.psych.pan.pl/Mój dysk/childproject-dataset/polish-dataset/annotations/"
os.chdir(parent_directory)
print("Current working directory: {0}".format(os.getcwd()))

# Minimum gap (in milliseconds) between two annotations for them to be
# considered part of separate annotation chunks.
chunk_break = 300000

for file in glob.glob(os.path.join(parent_directory, '**'), recursive=True):
    if not os.path.isfile(file):
        continue
    base, extension = os.path.splitext(file)
    if extension != '.eaf':
        continue
    # BUGFIX: the previous `file.split('.')` split on the FIRST dot of the
    # absolute path — and parent_directory itself contains dots
    # ("...sd.psych.pan.pl..."), so file_name came out wrong. Use the proper
    # basename of the path without its extension instead.
    file_name = os.path.basename(base)
    print(file_name)

    elan_file = pympi.Elan.Eaf(file)

    # Gather (StartTime, EndTime) of every annotation across all tiers.
    df = pd.DataFrame(columns=['StartTime', 'EndTime'])
    elan_file.get_full_time_interval()
    for tier in elan_file.get_tier_names():
        for ann in elan_file.get_annotation_data_for_tier(tier):
            df2 = pd.DataFrame({'StartTime': ann[0], 'EndTime': ann[1]}, index=[0])
            df = pd.concat([df, df2], ignore_index=True)

    # Chunk starts: the very first onset, plus every onset that follows the
    # previous onset by at least chunk_break ms.
    df = df.sort_values('StartTime').reset_index(drop=True)
    dif_st = np.diff(df['StartTime'].to_numpy())
    idx_st = [x + 1 for x, val in enumerate(dif_st) if val >= chunk_break]
    start_times = df['StartTime'].to_numpy()[idx_st]
    start_times = np.insert(start_times, 0, df['StartTime'].to_numpy()[0])

    # Chunk ends: every offset followed by a gap of at least chunk_break ms,
    # plus the very last offset.
    df = df.sort_values('EndTime').reset_index(drop=True)
    dif_end = np.diff(df['EndTime'].to_numpy())
    idx_end = [x for x, val in enumerate(dif_end) if val >= chunk_break]
    end_times = df['EndTime'].to_numpy()[idx_end]
    end_times = np.append(end_times, df['EndTime'].to_numpy()[-1])

    final = pd.DataFrame(columns=['StartTime', 'EndTime'])
    final['StartTime'] = start_times
    final['EndTime'] = end_times
    print(final)

    # One CSV of chunk boundaries per recording (os.path.join avoids the
    # doubled slashes the old string concatenation produced).
    out_dir = os.path.join(parent_directory, 'annotation_chunks', file_name)
    os.makedirs(out_dir, exist_ok=True)
    final.to_csv(os.path.join(out_dir, file_name + '.csv'))

+ 88 - 0
scripts/confusion_matrix2.py

@@ -0,0 +1,88 @@
"""Compare two annotation sets of a ChildProject dataset via a confusion matrix.

Compares manual annotations (SET_1) against automatic VTC annotations (SET_2)
over their time intersection, writes per-speaker and overall
precision/recall/F-score to scores.txt, and saves a raw and a normalized
confusion-matrix plot to conf_matrix.png / conf_matrix_normalized.png.
Run from the dataset root.
"""
from ChildProject.projects import ChildProject
from ChildProject.annotations import AnnotationManager
from ChildProject.metrics import segments_to_grid, conf_matrix

import numpy as np
import matplotlib.pyplot as plt

speakers = ['CHI', 'OCH', 'FEM', 'MAL'] #PUT HERE THE LABELS YOU WANT TO INCLUDE
project = ChildProject('.')
am = AnnotationManager(project)
am.read()

SET_1 = 'eaf_2023/ak' #CHANGE THE FOLDER TO WHERE THE MANUAL ANNOTATIONS ARE
SET_2 = 'vtc' #CHANGE THE FOLDER TO WHERE VTC GENERATED ANNOTATIONS ARE

# Restrict the comparison to the time ranges covered by BOTH sets.
intersection = AnnotationManager.intersection(am.annotations, [SET_1, SET_2])

segments = am.get_collapsed_segments(intersection)
# Only keep segments whose speaker type is one of the labels of interest.
segments = segments[segments['speaker_type'].isin(speakers)]

# Y
#vtc = segments_to_grid(segments[segments['set'] == 'vtc'], 0, segments['segment_offset'].max(), 100, 'speaker_type', speakers,none = False)
# Time grid (100 ms bins) of speaker activity for the reference set.
vtc = segments_to_grid(segments[segments['set'] == SET_1], 0, segments['segment_offset'].max(), 100, 'speaker_type', speakers)

# X
#its = segments_to_grid(segments[segments['set'] == 'its'], 0, segments['segment_offset'].max(), 100, 'speaker_type', speakers,none = False)
# Same grid for the hypothesis set.
its = segments_to_grid(segments[segments['set'] == SET_2], 0, segments['segment_offset'].max(), 100, 'speaker_type', speakers)

# Frame-level confusion counts; the final row/column corresponds to the
# extra "None" class (see speakers.append("None") below).
confusion_counts = conf_matrix(vtc, its)
all_positive = np.delete(confusion_counts, -1, 0)  # drop the "None" row
all_negative = np.delete(confusion_counts, -1, 1)  # drop the "None" column

# Overall scores: trace of the speaker-only sub-matrix over the respective totals.
precision = np.delete(all_negative, -1, 0).trace() / all_positive.sum()
recall = np.delete(all_negative, -1, 0).trace() / all_negative.sum()
fscore = (2 * precision * recall) / (precision + recall)

scores = {}
i=0

with open('scores.txt','w') as f:
    # Per-speaker scores read off row/column i of the confusion matrix.
    # NOTE(review): a label absent from either set yields a 0/0 division
    # (NaN) here — confirm whether that can occur for this dataset.
    for label in speakers:
        rec = confusion_counts[i,i] / confusion_counts[ :,i].sum()
        preci = confusion_counts[i,i] / confusion_counts[i,: ].sum()
        fsc = (2 * preci * rec) / (preci + rec)
        #scores[label] = (preci, rec, fsc)
        f.write(f"{label}: precision {preci}; recall {rec}; F-score {fsc}\n")
        i+=1

    f.write(f"General: precision {precision}; recall {recall}; F-score {fscore}\n")
    #print(f"General: precision {precision}; recall {recall}; F-score {fscore}")

print(f"Results written to scores.txt")

normalized = confusion_counts

# Tick labels: leading "" presumably compensates for the first (off-grid)
# tick placed by matshow — TODO confirm; trailing "None" labels the extra class.
speakers.append("None")
speakers = [""] + speakers

fig, ax = plt.subplots(figsize=(7.5, 7.5))
ax.set_xticklabels(speakers)  
ax.set_yticklabels(speakers)
ax.matshow(normalized, cmap=plt.cm.Blues, alpha=0.3)
# Annotate each cell with its (rounded) value.
for i in range(normalized.shape[0]):
    for j in range(normalized.shape[1]):
        ax.text(x=j, y=i,s=round(normalized[i, j],3), va='center', ha='center', size='xx-large')
 
ax.xaxis.set_label_position("top")
# set Y and X
plt.ylabel(SET_1, fontsize=18)
plt.xlabel(SET_2, fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.savefig('conf_matrix.png')

# Second plot: counts normalized by the per-label frame totals of the
# reference grid (row-wise normalization).
normalized = confusion_counts/(np.sum(vtc, axis = 0)[:,None])

fig, ax = plt.subplots(figsize=(7.5, 7.5))
ax.set_xticklabels(speakers)  
ax.set_yticklabels(speakers)
ax.matshow(normalized, cmap=plt.cm.Blues, alpha=0.3)
for i in range(normalized.shape[0]):
    for j in range(normalized.shape[1]):
        ax.text(x=j, y=i,s=round(normalized[i, j],3), va='center', ha='center', size='xx-large')
 
ax.xaxis.set_label_position("top")
plt.ylabel(SET_1, fontsize=18)
plt.xlabel(SET_2, fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.savefig('conf_matrix_normalized.png')

+ 38 - 0
scripts/import.py

@@ -0,0 +1,38 @@
#!/usr/bin/env python3
"""Bulk-import automated annotations (VTC / ALICE / VCM) into the dataset.

Builds an import table from the project's recordings metadata (one importation
per recording) and hands it to ChildProject's AnnotationManager. Edit the
annot_type selection below to pick which annotation set to import, then run
from the dataset root.
"""
import pandas as pd
import os
from ChildProject.projects import ChildProject
from ChildProject.annotations import AnnotationManager

dataset_path = "."
### !!!! EDIT THIS SECTION !!!! ### UNCOMMENT ONE AT A TIME AND RUN ONE AT A TIME
#annot_type = {"set":"vtc","file_extension":".rttm","format":"vtc_rttm"} # UNCOMMENT FOR VTC ANNOTATIONS
annot_type = {"set":"alice/output","file_extension":".txt","format":"alice"} # UNCOMMENT FOR ALICE ANNOTATIONS
#annot_type = {"set":"vcm","file_extension":".vcm","format":"vcm_rttm"} # UNCOMMENT FOR VCM ANNOTATIONS
###################################

#load the project
project = ChildProject(dataset_path)
# load the annotation manager for our project
am = AnnotationManager(project)

# we take a copy of the recordings.csv file of the dataset, that suits us because we have one importation per recording, as is usually the case with automated annotations
input_frame = pd.DataFrame.copy(project.recordings)

# let's drop every column that we don't need
input_frame.drop(['experiment', 'child_id', 'date_iso', 'start_time', 'recording_device_type'], axis = 1, inplace = True)

#make sure that the duration for the recordings is set in recordings.csv, otherwise run child-project compute-durations /path

input_frame["raw_filename"]= input_frame.apply(lambda row: os.path.splitext(row["recording_filename"])[0] + annot_type["file_extension"], axis=1) #take the name of the audio and add extension of the annotation (so this assumes the annotation file has the same name as the audio apart from extension)
input_frame["set"] = annot_type["set"] #set to import to
input_frame["format"] = annot_type["format"] #format of the annotation
input_frame["time_seek"] = "0" # timestamps in the file don't need to be shifted
input_frame["range_onset"] = "0" #from the start of the audio              ...
input_frame["range_offset"] = input_frame["duration"] # ...to the end

# Restrict the importation to the recordings of interest.
BP_RECS = ['77033_5/77033_5.WAV']
input_frame = input_frame[input_frame["recording_filename"].isin(BP_RECS)] #only keep bp recs
input_frame = input_frame.drop(columns=['duration'])

am.import_annotations(input_frame)

+ 231 - 0
scripts/import_eaf_poland.py

@@ -0,0 +1,231 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Jan  2 14:53:20 2023
+
+@author: lpeurey
+
+Manage  the importation of eaf 2022 annotation campaign
+custom converter to import properly
+"""
+import glob
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+import pympi
+from collections import defaultdict
+import os
+
+from ChildProject.projects import ChildProject
+from ChildProject.annotations import AnnotationManager
+
# Maps ELAN tier names to ChildProject speaker types; any tier name not
# listed here falls back to "NA" (defaultdict factory).
SPEAKER_ID_TO_TYPE = defaultdict(
        lambda: "NA",
        {
            "CHI": "CHI",
            "FEM": "FEM",
            "MAL": "MAL",
            "OCH": "OCH",
        },
    )

# Maps raw vcm annotation values to the stored vcm_type codes (the raw value
# itself is kept in vcm_type_precise by convert() below); A/P collapse to N,
# W/V collapse to C.
VCM_MAPPING = {
        'A':'N',
        'P':'N',
        'W':'C',
        'V':'C',
        'L':'L',
        'Y':'Y',
        'U':'U',
        }
# Maps raw xds (addressee) annotation values to the stored addressee codes;
# 'B' maps to the combined value 'A,C'.
XDS_MAPPING = {
        'T':'T',
        'C':'C',
        'B':'A,C',
        'A':'A',
        'P':'P',
        'O':'O',
        'U':'U',
        }

# Recordings to process (recording_filename values, relative to recordings/raw).
BP_RECS = ['77033_5/77033_5.WAV']
def convert(filename: str, filter=None, **kwargs) -> pd.DataFrame:
    """Convert one ELAN (.eaf) file into a ChildProject segments DataFrame.

    Pass 1 turns every annotation on a known speaker tier (see
    SPEAKER_ID_TO_TYPE) into a segment dict keyed by annotation id, with all
    secondary fields initialised to "NA". Pass 2 walks the dependent tiers
    (named "<label>@<speaker>") up their PARENT_REF chain back to the root
    annotation and fills in vcm/msc/gra/xds fields on that segment.

    Used as a custom import_function for AnnotationManager.import_annotations;
    `filter` and **kwargs are accepted for that interface but unused here.
    """
    eaf = pympi.Elan.Eaf(filename)
    
    segments = {}
    # Pass 1: top-level speaker tiers -> one segment per aligned annotation.
    for tier_name in eaf.tiers:
        print(tier_name)
        # eaf.tiers[name][0] holds the tier's aligned (time-anchored) annotations.
        annotations = eaf.tiers[tier_name][0]
        if (
            tier_name not in SPEAKER_ID_TO_TYPE
            and len(annotations) > 0
        ):
            print(
                "warning: unknown tier '{}' will be ignored in '{}'".format(
                    tier_name, filename
                )
            )
            continue
    
        for aid in annotations:
            (start_ts, end_ts, value, svg_ref) = annotations[aid]
            # Resolve timeslot references to actual times (ms).
            (start_t, end_t) = (eaf.timeslots[start_ts], eaf.timeslots[end_ts])
    
            segment = {
                "segment_onset": int(round(start_t)),
                "segment_offset": int(round(end_t)),
                "speaker_id": tier_name,
                "speaker_type": SPEAKER_ID_TO_TYPE[tier_name],
                "vcm_type": "NA",
                "vcm_type_precise": "NA",
                "msc_type": "NA",
                "xds_type": "NA",
                "gra_type": "NA",
                "addressee": "NA",
            }
    
            segments[aid] = segment
    
    # Pass 2: dependent tiers ("label@speaker") -> enrich the parent segments.
    for tier_name in eaf.tiers:
        if "@" in tier_name:
            label, ref = tier_name.split("@")
        else:
            label, ref = tier_name, None
    
        # eaf.tiers[name][1] holds the tier's reference (symbolic) annotations.
        reference_annotations = eaf.tiers[tier_name][1]
    
        # Skip tiers that do not reference a known speaker tier (ref is None
        # for top-level tiers, so they are skipped here too).
        if ref not in SPEAKER_ID_TO_TYPE:
            continue
    
        for aid in reference_annotations:
            (ann, value, prev, svg) = reference_annotations[aid]
    
            # NOTE(review): the unpacked `ann` is immediately overwritten —
            # the parent walk deliberately starts from this annotation's own id.
            ann = aid
            # Climb the PARENT_REF chain until reaching the root (time-aligned)
            # annotation this reference annotation ultimately belongs to.
            parentTier = eaf.tiers[eaf.annotations[ann]]
            while (
                "PARENT_REF" in parentTier[2]
                and parentTier[2]["PARENT_REF"]
                and len(parentTier[2]) > 0
            ):
                ann = parentTier[1][ann][0]
                parentTier = eaf.tiers[eaf.annotations[ann]]
    
            if ann not in segments:
                print(
                    "warning: annotation '{}' not found in segments for '{}'".format(
                        ann, filename
                    )
                )
                continue
    
            segment = segments[ann]
    
            if value: #discard segments that have no label (kept NA)
                if label == "vcm":
                    segment["vcm_type"] = VCM_MAPPING[value]
                    segment["vcm_type_precise"] = value
                elif label == "msc":
                    segment["msc_type"] = value
                elif label == "gra":
                    segment["gra_type"] = value
                elif label == "xds":
                    segment["addressee"] = XDS_MAPPING[value]
                
    
    return pd.DataFrame(segments.values())
# Raw .eaf files to import (matched against the basename of globbed files).
BP_REC = ['77033_5.eaf']
# Minimum gap (in milliseconds) between two annotations for them to be
# considered part of separate annotation chunks.
chunk_break = 300000
if __name__ == '__main__' :
    
    project = ChildProject('.')
    am = AnnotationManager(project)

    # Build one import row per selected raw .eaf file.
    files = pd.DataFrame([
        {'raw_filename': f}
        for f in glob.glob('./annotations/eaf_2023/ak/raw/*/*.eaf') if f.split('/')[-1] in BP_REC
    ])

    files['time_seek'] = 0
    files['raw_filename'] = files['raw_filename'].apply(os.path.basename)
    # Derive the recording filename from the eaf basename:
    # "77033_5.eaf" -> "77033_5/77033_5.WAV".
    files['recording_filename'] = files['raw_filename'].apply(lambda x: x.split('.')[-2] + '/' + x.split('.')[-2] + '.WAV')
    # files = files[files['recording_filename'].isin(project.recordings['recording_filename'])]
    files['set'] = 'eaf_2023/ak'
    files['format'] = 'eaf'
    print(files)
    _files = []

    # For each file, compute the annotated time chunks (same gap-based logic
    # as scripts/calculate_annotation_chunks.py) so that each chunk becomes
    # one importation range.
    for f in files.to_dict(orient='records'):
        eaf = pympi.Elan.Eaf(Path('./annotations') / 'eaf_2023' / 'ak' / 'raw' / f['raw_filename'].split('.')[0] / f['raw_filename'])

        df = pd.DataFrame(columns=['range_onset', 'range_offset'])
        eaf.get_full_time_interval()

        # Collect every annotation's onset/offset across all tiers.
        for tier in eaf.get_tier_names():
            for ann in eaf.get_annotation_data_for_tier(tier):
                df2 = pd.DataFrame({'range_onset': ann[0], 'range_offset': ann[1]}, index=[0])
                df = pd.concat([df, df2], ignore_index=True)
        df = df.sort_values('range_onset').reset_index(drop=True)

        # Chunk starts: first onset + every onset following a gap >= chunk_break.
        dif_st = np.diff(df['range_onset'].to_numpy())
        idx_st = [x + 1 for x, val in enumerate(dif_st) if val >= chunk_break]
        start_times = df['range_onset'].to_numpy()[idx_st]
        start_times = np.insert(start_times, 0, df['range_onset'].to_numpy()[0])

        df = df.sort_values('range_offset').reset_index(drop=True)
        dif_end = np.diff(df['range_offset'].to_numpy())

        # Chunk ends: every offset preceding a gap >= chunk_break + last offset.
        idx_end = [x for x, val in enumerate(dif_end) if val >= chunk_break]
        end_times = df['range_offset'].to_numpy()[idx_end]
        end_times = np.append(end_times, df['range_offset'].to_numpy()[-1])

        # One import row per chunk.
        # NOTE(review): raw_filename / recording_filename are hard-coded here
        # rather than taken from f — only valid while BP_REC contains this
        # single recording; confirm before adding more files.
        final = pd.DataFrame(columns=['range_onset', 'range_offset'])
        final['range_onset'] = start_times
        final['range_offset'] = end_times
        final['time_seek'] = 0
        final['raw_filename'] = '77033_5/77033_5.eaf'
        final['recording_filename'] = '77033_5/77033_5.WAV'
        final['format'] = 'eaf'
        final['set'] = 'eaf_2023/ak'


        #_files.append(pd.DataFrame([f]))

        # for tier_name in ['CHI', 'FEM', 'MAL', 'OCH']:
        #     portions = eaf.tiers[tier_name][0] #tier names
        #
        #     for pid in portions:
        #         (start_ts, end_ts, value, svg_ref) = portions[pid]
        #         (start_t, end_t) = (eaf.timeslots[start_ts], eaf.timeslots[end_ts])
        #
        #         # if value.upper() != 'Y':
        #         #    continue
        #         f['tier'] = tier_name
        #         f['range_onset'] = start_t
        #         f['range_offset'] = end_t
        #
        #         _files.append(pd.DataFrame([f]))


    #import_df = pd.concat(_files).reset_index(drop=True)
    # NOTE(review): `final` is assigned inside the loop, so only the LAST
    # file's chunks survive here (and this raises NameError if no file
    # matched BP_REC) — fine for a single recording, fragile otherwise.
    import_df = final.reset_index(drop=True)
    print(import_df)
    
    # import_df = project.recordings[['recording_filename', 'duration']]
    # import_df = import_df[import_df["recording_filename"].isin(BP_RECS)] #only keep bp recs
    # import_df.rename(columns={'duration':'range_offset'}, inplace=True)
    #
    # #import_df['set'] = 'eaf_2022/an1' # first batch
    # import_df['set'] = 'eaf_2023/ak' #import bautista's annotations
    #
    # import_df['time_seek'] = 0
    # import_df['range_onset'] = 0
    # import_df['format'] = 'eaf'
    #
    # #import_df['raw_filename'] = import_df['recording_filename'].apply(lambda x: RECORDINGS_MAPPING[x])
    # import_df['raw_filename'] = import_df['recording_filename'].apply(lambda x: os.path.basename(x.replace(".WAV",".eaf")))

    #print(import_df)
    # Import each chunk with the custom converter above; overwrite any
    # previously imported version of this set.
    am.import_annotations(import_df, threads=1, import_function=convert, overwrite_existing=True)
+