6 месяцев назад · da839ee3a4
--- a/.DS_Store
+++ b/.DS_Store
@@ -0,0 +1 @@
 
				+.git/annex/objects/w1/8z/MD5E-s8196--e4ad73ec75c7e39a8524034b8edb17cf/MD5E-s8196--e4ad73ec75c7e39a8524034b8edb17cf
			
--- a/annotations/.DS_Store
+++ b/annotations/.DS_Store
@@ -0,0 +1 @@
 
				+../.git/annex/objects/ZK/kz/MD5E-s6148--fbb42a91672c814bafa77c819961d737/MD5E-s6148--fbb42a91672c814bafa77c819961d737
			
--- a/annotations/eaf_2023/.DS_Store
+++ b/annotations/eaf_2023/.DS_Store
@@ -0,0 +1 @@
 
				+../../.git/annex/objects/xQ/09/MD5E-s6148--2edf891008094080170efe3e57150f96/MD5E-s6148--2edf891008094080170efe3e57150f96
			
--- a/annotations/eaf_2023/ak/.DS_Store
+++ b/annotations/eaf_2023/ak/.DS_Store
@@ -0,0 +1 @@
 
				+../../../.git/annex/objects/vw/7M/MD5E-s6148--8b44f2d28f2a8a70bf265ce6a261ca0d/MD5E-s6148--8b44f2d28f2a8a70bf265ce6a261ca0d
			
--- a/annotations/eaf_2023/ak/converted/77033_5/77033_5_10614464_10682016.csv
+++ b/annotations/eaf_2023/ak/converted/77033_5/77033_5_10614464_10682016.csv
@@ -0,0 +1 @@
 
				+../../../../../.git/annex/objects/34/fp/MD5E-s2049--e079d0128832ef5873d6cd05ec920d20.csv/MD5E-s2049--e079d0128832ef5873d6cd05ec920d20.csv
			
--- a/annotations/eaf_2023/ak/converted/77033_5/77033_5_14928930_14991430.csv
+++ b/annotations/eaf_2023/ak/converted/77033_5/77033_5_14928930_14991430.csv
@@ -0,0 +1 @@
 
				+../../../../../.git/annex/objects/M1/gG/MD5E-s2689--22e7ba5ca977585a43773123ea189531.csv/MD5E-s2689--22e7ba5ca977585a43773123ea189531.csv
			
--- a/annotations/eaf_2023/ak/converted/77033_5/77033_5_17240844_17312171.csv
+++ b/annotations/eaf_2023/ak/converted/77033_5/77033_5_17240844_17312171.csv
@@ -0,0 +1 @@
 
				+../../../../../.git/annex/objects/gJ/7Q/MD5E-s2241--fb605b52094c03679a225991b097a3e3.csv/MD5E-s2241--fb605b52094c03679a225991b097a3e3.csv
			
--- a/annotations/eaf_2023/ak/converted/77033_5/77033_5_1804637_1918913.csv
+++ b/annotations/eaf_2023/ak/converted/77033_5/77033_5_1804637_1918913.csv
@@ -0,0 +1 @@
 
				+../../../../../.git/annex/objects/7G/Pf/MD5E-s4779--57a54837fc9a4437912122a8f8e88c35.csv/MD5E-s4779--57a54837fc9a4437912122a8f8e88c35.csv
			
--- a/annotations/eaf_2023/ak/converted/77033_5/77033_5_18623929_18668843.csv
+++ b/annotations/eaf_2023/ak/converted/77033_5/77033_5_18623929_18668843.csv
@@ -0,0 +1 @@
 
				+../../../../../.git/annex/objects/8Z/kp/MD5E-s2177--615c45205e56fea6994f1baeccb63723.csv/MD5E-s2177--615c45205e56fea6994f1baeccb63723.csv
			
--- a/annotations/eaf_2023/ak/converted/77033_5/77033_5_19080362_19095206.csv
+++ b/annotations/eaf_2023/ak/converted/77033_5/77033_5_19080362_19095206.csv
@@ -0,0 +1 @@
 
				+../../../../../.git/annex/objects/Kj/9z/MD5E-s577--f9d916bc51322fb0a114dcae246a96ae.csv/MD5E-s577--f9d916bc51322fb0a114dcae246a96ae.csv
			
--- a/annotations/eaf_2023/ak/converted/77033_5/77033_5_310_72488.csv
+++ b/annotations/eaf_2023/ak/converted/77033_5/77033_5_310_72488.csv
@@ -0,0 +1 @@
 
				+../../../../../.git/annex/objects/Kz/X2/MD5E-s2609--72a3f8b82e8895c885e86443f7fef461.csv/MD5E-s2609--72a3f8b82e8895c885e86443f7fef461.csv
			
--- a/annotations/eaf_2023/ak/converted/77033_5/77033_5_3378792_3441447.csv
+++ b/annotations/eaf_2023/ak/converted/77033_5/77033_5_3378792_3441447.csv
@@ -0,0 +1 @@
 
				+../../../../../.git/annex/objects/3f/jk/MD5E-s2361--e41109442e447563f6d49fd11a1f7936.csv/MD5E-s2361--e41109442e447563f6d49fd11a1f7936.csv
			
--- a/annotations/eaf_2023/ak/converted/77033_5/77033_5_7034603_7098981.csv
+++ b/annotations/eaf_2023/ak/converted/77033_5/77033_5_7034603_7098981.csv
@@ -0,0 +1 @@
 
				+../../../../../.git/annex/objects/Xf/0Q/MD5E-s1245--52c5e720c7eb9737c63e413556378d44.csv/MD5E-s1245--52c5e720c7eb9737c63e413556378d44.csv
			
--- a/annotations/eaf_2023/ak/converted/77033_5/77033_5_8343248_8406016.csv
+++ b/annotations/eaf_2023/ak/converted/77033_5/77033_5_8343248_8406016.csv
@@ -0,0 +1 @@
 
				+../../../../../.git/annex/objects/2k/V4/MD5E-s2113--710ed966074c091673932ab4a9f037f0.csv/MD5E-s2113--710ed966074c091673932ab4a9f037f0.csv
			
--- a/annotations/eaf_2023/ak/converted/77033_5/77033_5_899827_962068.csv
+++ b/annotations/eaf_2023/ak/converted/77033_5/77033_5_899827_962068.csv
@@ -0,0 +1 @@
 
				+../../../../../.git/annex/objects/13/Z7/MD5E-s1869--2a8e2b42ced0aa84aabd31ec8ed78297.csv/MD5E-s1869--2a8e2b42ced0aa84aabd31ec8ed78297.csv
			
--- a/annotations/eaf_2023/ak/raw/.DS_Store
+++ b/annotations/eaf_2023/ak/raw/.DS_Store
@@ -0,0 +1 @@
 
				+../../../../.git/annex/objects/FG/5W/MD5E-s6148--e6d5af7e7f055c2713261621be7e5055/MD5E-s6148--e6d5af7e7f055c2713261621be7e5055
			
--- a/annotations/eaf_2023/ak/raw/77021_5/V20230127-070014.eaf
+++ b/annotations/eaf_2023/ak/raw/77021_5/V20230127-070014.eaf
@@ -0,0 +1 @@
 
				+../../../../../.git/annex/objects/pv/X4/MD5E-s31408--fb21ed4acd689a0eff81fc7a4d085c34.eaf/MD5E-s31408--fb21ed4acd689a0eff81fc7a4d085c34.eaf
			
--- a/annotations/eaf_2023/ak/raw/77021_5/V20230127-070014.pfsx
+++ b/annotations/eaf_2023/ak/raw/77021_5/V20230127-070014.pfsx
@@ -0,0 +1 @@
 
				+../../../../../.git/annex/objects/6j/gv/MD5E-s4230--d38a3757f90e876076605307f6b929c2.pfsx/MD5E-s4230--d38a3757f90e876076605307f6b929c2.pfsx
			
--- a/annotations/eaf_2023/ak/raw/77033_5/.DS_Store
+++ b/annotations/eaf_2023/ak/raw/77033_5/.DS_Store
@@ -0,0 +1 @@
 
				+../../../../../.git/annex/objects/3m/P7/MD5E-s6148--6531a98d751bcbce45b10ffed6347fa8/MD5E-s6148--6531a98d751bcbce45b10ffed6347fa8
			
--- a/annotations/eaf_2023/ak/raw/77033_5/77033_5.eaf
+++ b/annotations/eaf_2023/ak/raw/77033_5/77033_5.eaf
@@ -0,0 +1 @@
 
				+../../../../../.git/annex/objects/xv/4G/MD5E-s142563--5fc93d341aa2d40b2c36396348bf7988.eaf/MD5E-s142563--5fc93d341aa2d40b2c36396348bf7988.eaf
			
--- a/annotations/eaf_2023/ak/raw/77033_5/77033_5.pfsx
+++ b/annotations/eaf_2023/ak/raw/77033_5/77033_5.pfsx
@@ -0,0 +1 @@
 
				+../../../../../.git/annex/objects/88/02/MD5E-s4133--f96a429b914b12bb11b61067c90cc1da.pfsx/MD5E-s4133--f96a429b914b12bb11b61067c90cc1da.pfsx
			
--- a/annotations/eaf_2023/polish_template.etf
+++ b/annotations/eaf_2023/polish_template.etf
@@ -0,0 +1 @@
 
				+../../.git/annex/objects/P5/63/MD5E-s3546--c5001de7a055c9a9c8ad2ae1f08aee1e.etf/MD5E-s3546--c5001de7a055c9a9c8ad2ae1f08aee1e.etf
			
--- a/metadata/annotations.csv
+++ b/metadata/annotations.csv
@@ -0,0 +1,12 @@
 
				+set,recording_filename,time_seek,range_onset,range_offset,raw_filename,format,filter,annotation_filename,imported_at,package_version,error,merged_from
			
 
				+eaf_2023/ak,77033_5/77033_5.WAV,0,10614464,10682016,77033_5/77033_5.eaf,eaf,,77033_5/77033_5_10614464_10682016.csv,2023-10-22 15:45:26,0.1.1,,
			
 
				+eaf_2023/ak,77033_5/77033_5.WAV,0,14928930,14991430,77033_5/77033_5.eaf,eaf,,77033_5/77033_5_14928930_14991430.csv,2023-10-22 15:45:26,0.1.1,,
			
 
				+eaf_2023/ak,77033_5/77033_5.WAV,0,17240844,17312171,77033_5/77033_5.eaf,eaf,,77033_5/77033_5_17240844_17312171.csv,2023-10-22 15:45:26,0.1.1,,
			
 
				+eaf_2023/ak,77033_5/77033_5.WAV,0,1804637,1918913,77033_5/77033_5.eaf,eaf,,77033_5/77033_5_1804637_1918913.csv,2023-10-22 15:45:26,0.1.1,,
			
 
				+eaf_2023/ak,77033_5/77033_5.WAV,0,18623929,18668843,77033_5/77033_5.eaf,eaf,,77033_5/77033_5_18623929_18668843.csv,2023-10-22 15:45:26,0.1.1,,
			
 
				+eaf_2023/ak,77033_5/77033_5.WAV,0,19080362,19095206,77033_5/77033_5.eaf,eaf,,77033_5/77033_5_19080362_19095206.csv,2023-10-22 15:45:26,0.1.1,,
			
 
				+eaf_2023/ak,77033_5/77033_5.WAV,0,310,72488,77033_5/77033_5.eaf,eaf,,77033_5/77033_5_310_72488.csv,2023-10-22 15:45:26,0.1.1,,
			
 
				+eaf_2023/ak,77033_5/77033_5.WAV,0,3378792,3441447,77033_5/77033_5.eaf,eaf,,77033_5/77033_5_3378792_3441447.csv,2023-10-22 15:45:26,0.1.1,,
			
 
				+eaf_2023/ak,77033_5/77033_5.WAV,0,7034603,7098981,77033_5/77033_5.eaf,eaf,,77033_5/77033_5_7034603_7098981.csv,2023-10-22 15:45:26,0.1.1,,
			
 
				+eaf_2023/ak,77033_5/77033_5.WAV,0,8343248,8406016,77033_5/77033_5.eaf,eaf,,77033_5/77033_5_8343248_8406016.csv,2023-10-22 15:45:26,0.1.1,,
			
 
				+eaf_2023/ak,77033_5/77033_5.WAV,0,899827,962068,77033_5/77033_5.eaf,eaf,,77033_5/77033_5_899827_962068.csv,2023-10-22 15:45:26,0.1.1,,
			
--- a/metadata/children.csv
+++ b/metadata/children.csv
@@ -0,0 +1,2 @@
 
				+child_id,experiment,child_dob,location_id,child_sex,language,monoling,languages,mat_ed,fat_ed,monoling_criterion,n_of_siblings,household_size,dob_criterion,dob_accuracy,discard

			
 
				+77033_5,MOVIN,2021-02-08,warsaw,m,polish,Y,,17,17,“we asked families which languages they spoke in the home”,0,3,exact,day,0
			
--- a/metadata/recordings.csv
+++ b/metadata/recordings.csv
@@ -0,0 +1,2 @@
 
				+experiment,child_id,date_iso,start_time,start_time_accuracy,recording_device_type,recording_filename,location_id,duration

			
 
				+MOVIN,77033_5,2023-04-14,NA,hour,usb,77033_5/77033_5.WAV,warsaw,19100960
			
--- a/recordings/.DS_Store
+++ b/recordings/.DS_Store
@@ -0,0 +1 @@
 
				+../.git/annex/objects/1J/83/MD5E-s6148--41244a732097548580484001c3dd5fa2/MD5E-s6148--41244a732097548580484001c3dd5fa2
			
--- a/recordings/converted/.DS_Store
+++ b/recordings/converted/.DS_Store
@@ -0,0 +1 @@
 
				+../../.git/annex/objects/0V/Zp/MD5E-s6148--689f59669593f9c6162bbe3e14168f7e/MD5E-s6148--689f59669593f9c6162bbe3e14168f7e
			
--- a/recordings/converted/standard/.DS_Store
+++ b/recordings/converted/standard/.DS_Store
@@ -0,0 +1 @@
 
				+../../../.git/annex/objects/85/3v/MD5E-s6148--f04bd34aa70c217c85aedec2e23603a0/MD5E-s6148--f04bd34aa70c217c85aedec2e23603a0
			
--- a/recordings/converted/standard/77033_5/77033_5.wav
+++ b/recordings/converted/standard/77033_5/77033_5.wav
@@ -0,0 +1 @@
 
				+../../../../.git/annex/objects/Z6/kM/MD5E-s1222461518--61eaea172b360dd9cbe00ab30c098738.wav/MD5E-s1222461518--61eaea172b360dd9cbe00ab30c098738.wav
			
--- a/recordings/converted/standard/parameters_20231022_154333.yml
+++ b/recordings/converted/standard/parameters_20231022_154333.yml
@@ -0,0 +1 @@
 
				+../../../.git/annex/objects/0k/q8/MD5E-s283--3f43e34b993dfc2955b4aec6317c980f.yml/MD5E-s283--3f43e34b993dfc2955b4aec6317c980f.yml
			
--- a/recordings/converted/standard/recordings.csv
+++ b/recordings/converted/standard/recordings.csv
@@ -0,0 +1 @@
 
				+../../../.git/annex/objects/fG/W1/MD5E-s266--5b78a6f73f034e09ad02ba52f0b31fdf.csv/MD5E-s266--5b78a6f73f034e09ad02ba52f0b31fdf.csv
			
--- a/recordings/raw/.DS_Store
+++ b/recordings/raw/.DS_Store
@@ -0,0 +1 @@
 
				+../../.git/annex/objects/ZG/g1/MD5E-s6148--3657552f3b819a7a1ab13b2bcebc843b/MD5E-s6148--3657552f3b819a7a1ab13b2bcebc843b
			
--- a/recordings/raw/77033_5/77033_5.wav
+++ b/recordings/raw/77033_5/77033_5.wav
@@ -0,0 +1 @@
 
				+../../../.git/annex/objects/j2/6G/MD5E-s5501080576--495c039cbf09501b1178d68d48d1cd60.wav/MD5E-s5501080576--495c039cbf09501b1178d68d48d1cd60.wav
			
--- a/scripts/calculate_annotation_chunks.py
+++ b/scripts/calculate_annotation_chunks.py
@@ -0,0 +1,44 @@
 
				+import glob
			
 
				+import os
			
 
				+import numpy as np
			
 
				+import pandas as pd
			
 
				+import pympi
			
 
				+parent_directory = r"/Users/agatka/Library/CloudStorage/GoogleDrive-akoziol@sd.psych.pan.pl/Mój dysk/childproject-dataset/polish-dataset/annotations/"
			
 
				+os.chdir(parent_directory)
			
 
				+print("Current working directory: {0}".format(os.getcwd()))
			
 
				+chunk_break = 300000 #here put in miliseconds approximately how long is the shortest break between annotation chunks
			
 
				+for i, file in enumerate(glob.glob(os.path.join(parent_directory, '**'), recursive=True)):
			
 
				+    if os.path.isfile(os.path.join(parent_directory, file)):
			
 
				+        ext = file.split('.')
			
 
				+        if ext[-1] == 'eaf':
			
 
				+            print(ext[0].split('/')[-1])
			
 
				+            file_name = ext[0].split('/')[-1]
			
 
				+            elan_file = pympi.Elan.Eaf(file)
			
 
				+            #elan_data[file_name] = elan_file
			
 
				+            df = pd.DataFrame(columns=['StartTime', 'EndTime'])
			
 
				+            elan_file.get_full_time_interval()
			
 
				+
			
 
				+            for tier in elan_file.get_tier_names():
			
 
				+                for ann in elan_file.get_annotation_data_for_tier(tier):
			
 
				+                    df2 = pd.DataFrame({'StartTime': ann[0], 'EndTime': ann[1]}, index=[0])
			
 
				+                    df = pd.concat([df, df2], ignore_index=True)
			
 
				+            df = df.sort_values('StartTime').reset_index(drop=True)
			
 
				+            
			
 
				+            dif_st = np.diff(df['StartTime'].to_numpy())
			
 
				+            idx_st = [x+1 for x, val in enumerate(dif_st) if val >= chunk_break]
			
 
				+            start_times = df['StartTime'].to_numpy()[idx_st]
			
 
				+            start_times = np.insert(start_times, 0, df['StartTime'].to_numpy()[0])
			
 
				+
			
 
				+            df = df.sort_values('EndTime').reset_index(drop=True)
			
 
				+            dif_end = np.diff(df['EndTime'].to_numpy())
			
 
				+
			
 
				+            idx_end = [x for x, val in enumerate(dif_end) if val >= chunk_break]
			
 
				+            end_times = df['EndTime'].to_numpy()[idx_end]
			
 
				+            end_times = np.append(end_times, df['EndTime'].to_numpy()[-1])
			
 
				+
			
 
				+            final = pd.DataFrame(columns=['StartTime', 'EndTime'])
			
 
				+            final['StartTime'] = start_times
			
 
				+            final['EndTime'] = end_times
			
 
				+            print(final)
			
 
				+            os.makedirs(parent_directory+'/annotation_chunks/{0}/'.format(file_name), exist_ok=True)
			
 
				+            final.to_csv(parent_directory+'/annotation_chunks/{0}/{0}.csv'.format(file_name))
			
--- a/scripts/confusion_matrix2.py
+++ b/scripts/confusion_matrix2.py
@@ -0,0 +1,88 @@
 
				+from ChildProject.projects import ChildProject
			
 
				+from ChildProject.annotations import AnnotationManager
			
 
				+from ChildProject.metrics import segments_to_grid, conf_matrix
			
 
				+
			
 
				+import numpy as np
			
 
				+import matplotlib.pyplot as plt
			
 
				+
			
 
				+speakers = ['CHI', 'OCH', 'FEM', 'MAL'] #PUT HERE THE LABELS YOU WANT TO INCLUDE
			
 
				+project = ChildProject('.')
			
 
				+am = AnnotationManager(project)
			
 
				+am.read()
			
 
				+
			
 
				+SET_1 = 'eaf_2023/ak' #CHANGE THE FOLDER TO WHERE THE MANUAL ANNOTATIONS ARE
			
 
				+SET_2 = 'vtc' #CHANGE THE FOLDER TO WHERE VTC GENERATED ANNOTATIONS ARE
			
 
				+
			
 
				+intersection = AnnotationManager.intersection(am.annotations, [SET_1, SET_2])
			
 
				+
			
 
				+segments = am.get_collapsed_segments(intersection)
			
 
				+segments = segments[segments['speaker_type'].isin(speakers)]
			
 
				+
			
 
				+# Y
			
 
				+#vtc = segments_to_grid(segments[segments['set'] == 'vtc'], 0, segments['segment_offset'].max(), 100, 'speaker_type', speakers,none = False)
			
 
				+vtc = segments_to_grid(segments[segments['set'] == SET_1], 0, segments['segment_offset'].max(), 100, 'speaker_type', speakers)
			
 
				+
			
 
				+# X
			
 
				+#its = segments_to_grid(segments[segments['set'] == 'its'], 0, segments['segment_offset'].max(), 100, 'speaker_type', speakers,none = False)
			
 
				+its = segments_to_grid(segments[segments['set'] == SET_2], 0, segments['segment_offset'].max(), 100, 'speaker_type', speakers)
			
 
				+
			
 
				+confusion_counts = conf_matrix(vtc, its)
			
 
				+all_positive = np.delete(confusion_counts, -1, 0)
			
 
				+all_negative = np.delete(confusion_counts, -1, 1)
			
 
				+
			
 
				+precision = np.delete(all_negative, -1, 0).trace() / all_positive.sum()
			
 
				+recall = np.delete(all_negative, -1, 0).trace() / all_negative.sum()
			
 
				+fscore = (2 * precision * recall) / (precision + recall)
			
 
				+
			
 
				+scores = {}
			
 
				+i=0
			
 
				+
			
 
				+with open('scores.txt','w') as f:
			
 
				+    for label in speakers:
			
 
				+        rec = confusion_counts[i,i] / confusion_counts[ :,i].sum()
			
 
				+        preci = confusion_counts[i,i] / confusion_counts[i,: ].sum()
			
 
				+        fsc = (2 * preci * rec) / (preci + rec)
			
 
				+        #scores[label] = (preci, rec, fsc)
			
 
				+        f.write(f"{label}: precision {preci}; recall {rec}; F-score {fsc}\n")
			
 
				+        i+=1
			
 
				+
			
 
				+    f.write(f"General: precision {precision}; recall {recall}; F-score {fscore}\n")
			
 
				+    #print(f"General: precision {precision}; recall {recall}; F-score {fscore}")
			
 
				+
			
 
				+print(f"Results written to scores.txt")
			
 
				+
			
 
				+normalized = confusion_counts
			
 
				+
			
 
				+speakers.append("None")
			
 
				+speakers = [""] + speakers
			
 
				+
			
 
				+fig, ax = plt.subplots(figsize=(7.5, 7.5))
			
 
				+ax.set_xticklabels(speakers)  
			
 
				+ax.set_yticklabels(speakers)
			
 
				+ax.matshow(normalized, cmap=plt.cm.Blues, alpha=0.3)
			
 
				+for i in range(normalized.shape[0]):
			
 
				+    for j in range(normalized.shape[1]):
			
 
				+        ax.text(x=j, y=i,s=round(normalized[i, j],3), va='center', ha='center', size='xx-large')
			
 
				+ 
			
 
				+ax.xaxis.set_label_position("top")
			
 
				+# set Y and X
			
 
				+plt.ylabel(SET_1, fontsize=18)
			
 
				+plt.xlabel(SET_2, fontsize=18)
			
 
				+plt.title('Confusion Matrix', fontsize=18)
			
 
				+plt.savefig('conf_matrix.png')
			
 
				+
			
 
				+normalized = confusion_counts/(np.sum(vtc, axis = 0)[:,None])
			
 
				+
			
 
				+fig, ax = plt.subplots(figsize=(7.5, 7.5))
			
 
				+ax.set_xticklabels(speakers)  
			
 
				+ax.set_yticklabels(speakers)
			
 
				+ax.matshow(normalized, cmap=plt.cm.Blues, alpha=0.3)
			
 
				+for i in range(normalized.shape[0]):
			
 
				+    for j in range(normalized.shape[1]):
			
 
				+        ax.text(x=j, y=i,s=round(normalized[i, j],3), va='center', ha='center', size='xx-large')
			
 
				+ 
			
 
				+ax.xaxis.set_label_position("top")
			
 
				+plt.ylabel(SET_1, fontsize=18)
			
 
				+plt.xlabel(SET_2, fontsize=18)
			
 
				+plt.title('Confusion Matrix', fontsize=18)
			
 
				+plt.savefig('conf_matrix_normalized.png')
			
--- a/scripts/import.py
+++ b/scripts/import.py
@@ -0,0 +1,38 @@
 
				+#!/usr/bin/env python3
			
 
				+import pandas as pd
			
 
				+import os
			
 
				+from ChildProject.projects import ChildProject
			
 
				+from ChildProject.annotations import AnnotationManager
			
 
				+
			
 
				+dataset_path = "."
			
 
				+### !!!! EDIT THIS SECTION !!!! ### UNCOMMENT ONE AT A TIME AND RUN ONE AT A TIME
			
 
				+#annot_type = {"set":"vtc","file_extension":".rttm","format":"vtc_rttm"} # UNCOMMENT FOR VTC ANNOTATIONS
			
 
				+annot_type = {"set":"alice/output","file_extension":".txt","format":"alice"} # UNCOMMENT FOR ALICE ANNOTATIONS
			
 
				+#annot_type = {"set":"vcm","file_extension":".vcm","format":"vcm_rttm"} # UNCOMMENT FOR VCM ANNOTATIONS
			
 
				+###################################
			
 
				+
			
 
				+#load the project
			
 
				+project = ChildProject(dataset_path)
			
 
				+# load the annotation manager for our project
			
 
				+am = AnnotationManager(project)
			
 
				+
			
 
				+# we take a copy of the recordings.csv file of the dataset, that suits us because we have one importation per recording, as is usually the case with automated annotations
			
 
				+input_frame = pd.DataFrame.copy(project.recordings)
			
 
				+
			
 
				+# let's drop every column that we don't need
			
 
				+input_frame.drop(['experiment', 'child_id', 'date_iso', 'start_time', 'recording_device_type'], axis = 1, inplace = True)
			
 
				+
			
 
				+#make sure that the duration for the recordings is set in recordings.csv, otherwise run child-project compute-durations /path
			
 
				+
			
 
				+input_frame["raw_filename"]= input_frame.apply(lambda row: os.path.splitext(row["recording_filename"])[0] + annot_type["file_extension"], axis=1) #take the name of the audio and add extension of the annotation (so this assumes the annotation file has the same name as the audio appart from extension)
			
 
				+input_frame["set"] = annot_type["set"] #set to import to
			
 
				+input_frame["format"] = annot_type["format"] #format of the annotation
			
 
				+input_frame["time_seek"] = "0" # timestamps in the file don't need to be shifted
			
 
				+input_frame["range_onset"] = "0" #from the start of the audio              ...
			
 
				+input_frame["range_offset"] = input_frame["duration"] # ...to the end
			
 
				+
			
 
				+BP_RECS = ['77033_5/77033_5.WAV']
			
 
				+input_frame = input_frame[input_frame["recording_filename"].isin(BP_RECS)] #only keep bp recs
			
 
				+input_frame = input_frame.drop(columns=['duration'])
			
 
				+
			
 
				+am.import_annotations(input_frame)
			
--- a/scripts/import_eaf_poland.py
+++ b/scripts/import_eaf_poland.py
@@ -0,0 +1,231 @@
 
				+#!/usr/bin/env python3
			
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on Mon Jan  2 14:53:20 2023
			
 
				+
			
 
				+@author: lpeurey
			
 
				+
			
 
				+Manage  the importation of eaf 2022 annotation campaign
			
 
				+custom converter to import properly
			
 
				+"""
			
 
				+import glob
			
 
				+from pathlib import Path
			
 
				+
			
 
				+import numpy as np
			
 
				+import pandas as pd
			
 
				+import pympi
			
 
				+from collections import defaultdict
			
 
				+import os
			
 
				+
			
 
				+from ChildProject.projects import ChildProject
			
 
				+from ChildProject.annotations import AnnotationManager
			
 
				+
			
 
				+SPEAKER_ID_TO_TYPE = defaultdict(
			
 
				+        lambda: "NA",
			
 
				+        {
			
 
				+            "CHI": "CHI",
			
 
				+            "FEM": "FEM",
			
 
				+            "MAL": "MAL",
			
 
				+            "OCH": "OCH",
			
 
				+        },
			
 
				+    )
			
 
				+
			
 
				+VCM_MAPPING = {
			
 
				+        'A':'N',
			
 
				+        'P':'N',
			
 
				+        'W':'C',
			
 
				+        'V':'C',
			
 
				+        'L':'L',
			
 
				+        'Y':'Y',
			
 
				+        'U':'U',
			
 
				+        }
			
 
				+XDS_MAPPING = {
			
 
				+        'T':'T',
			
 
				+        'C':'C',
			
 
				+        'B':'A,C',
			
 
				+        'A':'A',
			
 
				+        'P':'P',
			
 
				+        'O':'O',
			
 
				+        'U':'U',
			
 
				+        }
			
 
				+
			
 
				+BP_RECS = ['77033_5/77033_5.WAV']
			
 
				+def convert(filename: str, filter=None, **kwargs) -> pd.DataFrame:
			
 
				+
			
 
				+    eaf = pympi.Elan.Eaf(filename)
			
 
				+    
			
 
				+    segments = {}
			
 
				+    for tier_name in eaf.tiers:
			
 
				+        print(tier_name)
			
 
				+        annotations = eaf.tiers[tier_name][0]
			
 
				+        if (
			
 
				+            tier_name not in SPEAKER_ID_TO_TYPE
			
 
				+            and len(annotations) > 0
			
 
				+        ):
			
 
				+            print(
			
 
				+                "warning: unknown tier '{}' will be ignored in '{}'".format(
			
 
				+                    tier_name, filename
			
 
				+                )
			
 
				+            )
			
 
				+            continue
			
 
				+    
			
 
				+        for aid in annotations:
			
 
				+            (start_ts, end_ts, value, svg_ref) = annotations[aid]
			
 
				+            (start_t, end_t) = (eaf.timeslots[start_ts], eaf.timeslots[end_ts])
			
 
				+    
			
 
				+            segment = {
			
 
				+                "segment_onset": int(round(start_t)),
			
 
				+                "segment_offset": int(round(end_t)),
			
 
				+                "speaker_id": tier_name,
			
 
				+                "speaker_type": SPEAKER_ID_TO_TYPE[tier_name],
			
 
				+                "vcm_type": "NA",
			
 
				+                "vcm_type_precise": "NA",
			
 
				+                "msc_type": "NA",
			
 
				+                "xds_type": "NA",
			
 
				+                "gra_type": "NA",
			
 
				+                "addressee": "NA",
			
 
				+            }
			
 
				+    
			
 
				+            segments[aid] = segment
			
 
				+    
			
 
				+    for tier_name in eaf.tiers:
			
 
				+        if "@" in tier_name:
			
 
				+            label, ref = tier_name.split("@")
			
 
				+        else:
			
 
				+            label, ref = tier_name, None
			
 
				+    
			
 
				+        reference_annotations = eaf.tiers[tier_name][1]
			
 
				+    
			
 
				+        if ref not in SPEAKER_ID_TO_TYPE:
			
 
				+            continue
			
 
				+    
			
 
				+        for aid in reference_annotations:
			
 
				+            (ann, value, prev, svg) = reference_annotations[aid]
			
 
				+    
			
 
				+            ann = aid
			
 
				+            parentTier = eaf.tiers[eaf.annotations[ann]]
			
 
				+            while (
			
 
				+                "PARENT_REF" in parentTier[2]
			
 
				+                and parentTier[2]["PARENT_REF"]
			
 
				+                and len(parentTier[2]) > 0
			
 
				+            ):
			
 
				+                ann = parentTier[1][ann][0]
			
 
				+                parentTier = eaf.tiers[eaf.annotations[ann]]
			
 
				+    
			
 
				+            if ann not in segments:
			
 
				+                print(
			
 
				+                    "warning: annotation '{}' not found in segments for '{}'".format(
			
 
				+                        ann, filename
			
 
				+                    )
			
 
				+                )
			
 
				+                continue
			
 
				+    
			
 
				+            segment = segments[ann]
			
 
				+    
			
 
				+            if value: #discard segments that have no label (kept NA)
			
 
				+                if label == "vcm":
			
 
				+                    segment["vcm_type"] = VCM_MAPPING[value]
			
 
				+                    segment["vcm_type_precise"] = value
			
 
				+                elif label == "msc":
			
 
				+                    segment["msc_type"] = value
			
 
				+                elif label == "gra":
			
 
				+                    segment["gra_type"] = value
			
 
				+                elif label == "xds":
			
 
				+                    segment["addressee"] = XDS_MAPPING[value]
			
 
				+                
			
 
				+    
			
 
				+    return pd.DataFrame(segments.values())
			
 
				+BP_REC = ['77033_5.eaf']
			
 
				+chunk_break = 300000 #here put in miliseconds approximately how long is the shortest break between annotation chunks
			
 
				+if __name__ == '__main__' :
			
 
				+    
			
 
				+    project = ChildProject('.')
			
 
				+    am = AnnotationManager(project)
			
 
				+
			
 
				+    files = pd.DataFrame([
			
 
				+        {'raw_filename': f}
			
 
				+        for f in glob.glob('./annotations/eaf_2023/ak/raw/*/*.eaf') if f.split('/')[-1] in BP_REC
			
 
				+    ])
			
 
				+
			
 
				+    files['time_seek'] = 0
			
 
				+    files['raw_filename'] = files['raw_filename'].apply(os.path.basename)
			
 
				+    files['recording_filename'] = files['raw_filename'].apply(lambda x: x.split('.')[-2] + '/' + x.split('.')[-2] + '.WAV')
			
 
				+    # files = files[files['recording_filename'].isin(project.recordings['recording_filename'])]
			
 
				+    files['set'] = 'eaf_2023/ak'
			
 
				+    files['format'] = 'eaf'
			
 
				+    print(files)
			
 
				+    _files = []
			
 
				+
			
 
				+    for f in files.to_dict(orient='records'):
			
 
				+        eaf = pympi.Elan.Eaf(Path('./annotations') / 'eaf_2023' / 'ak' / 'raw' / f['raw_filename'].split('.')[0] / f['raw_filename'])
			
 
				+
			
 
				+        df = pd.DataFrame(columns=['range_onset', 'range_offset'])
			
 
				+        eaf.get_full_time_interval()
			
 
				+
			
 
				+        for tier in eaf.get_tier_names():
			
 
				+            for ann in eaf.get_annotation_data_for_tier(tier):
			
 
				+                df2 = pd.DataFrame({'range_onset': ann[0], 'range_offset': ann[1]}, index=[0])
			
 
				+                df = pd.concat([df, df2], ignore_index=True)
			
 
				+        df = df.sort_values('range_onset').reset_index(drop=True)
			
 
				+
			
 
				+        dif_st = np.diff(df['range_onset'].to_numpy())
			
 
				+        idx_st = [x + 1 for x, val in enumerate(dif_st) if val >= chunk_break]
			
 
				+        start_times = df['range_onset'].to_numpy()[idx_st]
			
 
				+        start_times = np.insert(start_times, 0, df['range_onset'].to_numpy()[0])
			
 
				+
			
 
				+        df = df.sort_values('range_offset').reset_index(drop=True)
			
 
				+        dif_end = np.diff(df['range_offset'].to_numpy())
			
 
				+
			
 
				+        idx_end = [x for x, val in enumerate(dif_end) if val >= chunk_break]
			
 
				+        end_times = df['range_offset'].to_numpy()[idx_end]
			
 
				+        end_times = np.append(end_times, df['range_offset'].to_numpy()[-1])
			
 
				+
			
 
				+        final = pd.DataFrame(columns=['range_onset', 'range_offset'])
			
 
				+        final['range_onset'] = start_times
			
 
				+        final['range_offset'] = end_times
			
 
				+        final['time_seek'] = 0
			
 
				+        final['raw_filename'] = '77033_5/77033_5.eaf'
			
 
				+        final['recording_filename'] = '77033_5/77033_5.WAV'
			
 
				+        final['format'] = 'eaf'
			
 
				+        final['set'] = 'eaf_2023/ak'
			
 
				+
			
 
				+
			
 
				+        #_files.append(pd.DataFrame([f]))
			
 
				+
			
 
				+        # for tier_name in ['CHI', 'FEM', 'MAL', 'OCH']:
			
 
				+        #     portions = eaf.tiers[tier_name][0] #tier names
			
 
				+        #
			
 
				+        #     for pid in portions:
			
 
				+        #         (start_ts, end_ts, value, svg_ref) = portions[pid]
			
 
				+        #         (start_t, end_t) = (eaf.timeslots[start_ts], eaf.timeslots[end_ts])
			
 
				+        #
			
 
				+        #         # if value.upper() != 'Y':
			
 
				+        #         #    continue
			
 
				+        #         f['tier'] = tier_name
			
 
				+        #         f['range_onset'] = start_t
			
 
				+        #         f['range_offset'] = end_t
			
 
				+        #
			
 
				+        #         _files.append(pd.DataFrame([f]))
			
 
				+
			
 
				+
			
 
				+    #import_df = pd.concat(_files).reset_index(drop=True)
			
 
				+    import_df = final.reset_index(drop=True)
			
 
				+    print(import_df)
			
 
				+    
			
 
				+    # import_df = project.recordings[['recording_filename', 'duration']]
			
 
				+    # import_df = import_df[import_df["recording_filename"].isin(BP_RECS)] #only keep bp recs
			
 
				+    # import_df.rename(columns={'duration':'range_offset'}, inplace=True)
			
 
				+    #
			
 
				+    # #import_df['set'] = 'eaf_2022/an1' # first batch
			
 
				+    # import_df['set'] = 'eaf_2023/ak' #import bautista's annotations
			
 
				+    #
			
 
				+    # import_df['time_seek'] = 0
			
 
				+    # import_df['range_onset'] = 0
			
 
				+    # import_df['format'] = 'eaf'
			
 
				+    #
			
 
				+    # #import_df['raw_filename'] = import_df['recording_filename'].apply(lambda x: RECORDINGS_MAPPING[x])
			
 
				+    # import_df['raw_filename'] = import_df['recording_filename'].apply(lambda x: os.path.basename(x.replace(".WAV",".eaf")))
			
 
				+
			
 
				+    #print(import_df)
			
 
				+    am.import_annotations(import_df, threads=1, import_function=convert, overwrite_existing=True)
			
 
				+
		`@@ -0,0 +1 @@`
		`+.git/annex/objects/w1/8z/MD5E-s8196--e4ad73ec75c7e39a8524034b8edb17cf/MD5E-s8196--e4ad73ec75c7e39a8524034b8edb17cf`
		`@@ -0,0 +1 @@`
		`+../.git/annex/objects/ZK/kz/MD5E-s6148--fbb42a91672c814bafa77c819961d737/MD5E-s6148--fbb42a91672c814bafa77c819961d737`
		`@@ -0,0 +1 @@`
		`+../../.git/annex/objects/xQ/09/MD5E-s6148--2edf891008094080170efe3e57150f96/MD5E-s6148--2edf891008094080170efe3e57150f96`
		`@@ -0,0 +1 @@`
		`+../../../.git/annex/objects/vw/7M/MD5E-s6148--8b44f2d28f2a8a70bf265ce6a261ca0d/MD5E-s6148--8b44f2d28f2a8a70bf265ce6a261ca0d`
		`@@ -0,0 +1 @@`
		`+../../../../../.git/annex/objects/34/fp/MD5E-s2049--e079d0128832ef5873d6cd05ec920d20.csv/MD5E-s2049--e079d0128832ef5873d6cd05ec920d20.csv`
		`@@ -0,0 +1 @@`
		`+../../../../../.git/annex/objects/M1/gG/MD5E-s2689--22e7ba5ca977585a43773123ea189531.csv/MD5E-s2689--22e7ba5ca977585a43773123ea189531.csv`
		`@@ -0,0 +1 @@`
		`+../../../../../.git/annex/objects/gJ/7Q/MD5E-s2241--fb605b52094c03679a225991b097a3e3.csv/MD5E-s2241--fb605b52094c03679a225991b097a3e3.csv`
		`@@ -0,0 +1 @@`
		`+../../../../../.git/annex/objects/7G/Pf/MD5E-s4779--57a54837fc9a4437912122a8f8e88c35.csv/MD5E-s4779--57a54837fc9a4437912122a8f8e88c35.csv`
		`@@ -0,0 +1 @@`
		`+../../../../../.git/annex/objects/8Z/kp/MD5E-s2177--615c45205e56fea6994f1baeccb63723.csv/MD5E-s2177--615c45205e56fea6994f1baeccb63723.csv`
		`@@ -0,0 +1 @@`
		`+../../../../../.git/annex/objects/Kj/9z/MD5E-s577--f9d916bc51322fb0a114dcae246a96ae.csv/MD5E-s577--f9d916bc51322fb0a114dcae246a96ae.csv`