1234567891011121314151617181920212223242526272829303132333435363738394041424344 |
# Extract the boundaries of annotated chunks from ELAN (.eaf) files.
#
# For every .eaf file found (recursively) under `parent_directory`, all
# annotations from all tiers are pooled, and a new chunk is considered to
# begin wherever consecutive annotations are at least `chunk_break`
# milliseconds apart.  The chunk boundaries are written to
# <parent_directory>/annotation_chunks/<name>/<name>.csv.
import glob
import os

import numpy as np
import pandas as pd
import pympi

parent_directory = r"/Users/agatka/Library/CloudStorage/GoogleDrive-akoziol@sd.psych.pan.pl/Mój dysk/childproject-dataset/polish-dataset/annotations/"
os.chdir(parent_directory)
print("Current working directory: {0}".format(os.getcwd()))

# Approximate length, in milliseconds, of the shortest break between
# annotation chunks.
chunk_break = 300000


def _collect_intervals(elan_file):
    """Return a DataFrame of (StartTime, EndTime) for every annotation on every tier."""
    # Build the row list first and create the frame once; concatenating a
    # one-row frame per annotation is quadratic in the number of annotations.
    rows = [
        (ann[0], ann[1])
        for tier in elan_file.get_tier_names()
        for ann in elan_file.get_annotation_data_for_tier(tier)
    ]
    return pd.DataFrame(rows, columns=['StartTime', 'EndTime'])


def _chunk_boundaries(intervals, min_break):
    """Return a DataFrame with one (StartTime, EndTime) row per annotated chunk.

    A chunk starts at the first annotation and wherever two consecutive
    (sorted) start times differ by at least ``min_break``; a chunk ends
    wherever two consecutive (sorted) end times differ by at least
    ``min_break`` and at the last annotation.

    ``intervals`` must contain at least one row.

    Raises:
        ValueError: if the number of detected chunk starts differs from the
            number of detected chunk ends, in which case they cannot be
            paired into chunks.
    """
    starts = np.sort(intervals['StartTime'].to_numpy())
    ends = np.sort(intervals['EndTime'].to_numpy())

    # Index i in the diff array compares element i with element i + 1, so a
    # large gap at i means a new chunk starts at i + 1 ...
    start_idx = np.flatnonzero(np.diff(starts) >= min_break) + 1
    chunk_starts = np.insert(starts[start_idx], 0, starts[0])

    # ... and the previous chunk ends at i.
    end_idx = np.flatnonzero(np.diff(ends) >= min_break)
    chunk_ends = np.append(ends[end_idx], ends[-1])

    # Starts and ends are detected independently, so their counts can differ
    # (e.g. when one annotation is much longer than the others).
    if len(chunk_starts) != len(chunk_ends):
        raise ValueError(
            "found {0} chunk starts but {1} chunk ends; "
            "cannot pair them into chunks".format(len(chunk_starts), len(chunk_ends))
        )

    return pd.DataFrame({'StartTime': chunk_starts, 'EndTime': chunk_ends})


for file in sorted(glob.glob(os.path.join(parent_directory, '**', '*.eaf'), recursive=True)):
    if not os.path.isfile(file):
        continue

    # Derive the name from the base name only: the directory part of the path
    # contains dots, so splitting the whole path on '.' yields the wrong name.
    # NOTE(review): files with the same base name in different subdirectories
    # still write to the same CSV - confirm this cannot happen in the dataset.
    file_name = os.path.splitext(os.path.basename(file))[0]
    print(file_name)

    elan_file = pympi.Elan.Eaf(file)
    df = _collect_intervals(elan_file)
    if df.empty:
        # Nothing to chunk; indexing the first/last annotation would fail.
        print("No annotations found in {0}, skipping".format(file_name))
        continue

    final = _chunk_boundaries(df, chunk_break)
    print(final)

    output_directory = os.path.join(parent_directory, 'annotation_chunks', file_name)
    os.makedirs(output_directory, exist_ok=True)
    final.to_csv(os.path.join(output_directory, '{0}.csv'.format(file_name)))
|