# calculate_annotation_chunks.py

import glob
import os

import numpy as np
import pandas as pd
import pympi
  6. parent_directory = r"/Users/agatka/Library/CloudStorage/GoogleDrive-akoziol@sd.psych.pan.pl/Mój dysk/childproject-dataset/polish-dataset/annotations/"
  7. os.chdir(parent_directory)
  8. print("Current working directory: {0}".format(os.getcwd()))
  9. chunk_break = 300000 #here put in miliseconds approximately how long is the shortest break between annotation chunks
  10. for i, file in enumerate(glob.glob(os.path.join(parent_directory, '**'), recursive=True)):
  11. if os.path.isfile(os.path.join(parent_directory, file)):
  12. ext = file.split('.')
  13. if ext[-1] == 'eaf':
  14. print(ext[0].split('/')[-1])
  15. file_name = ext[0].split('/')[-1]
  16. elan_file = pympi.Elan.Eaf(file)
  17. #elan_data[file_name] = elan_file
  18. df = pd.DataFrame(columns=['StartTime', 'EndTime'])
  19. elan_file.get_full_time_interval()
  20. for tier in elan_file.get_tier_names():
  21. for ann in elan_file.get_annotation_data_for_tier(tier):
  22. df2 = pd.DataFrame({'StartTime': ann[0], 'EndTime': ann[1]}, index=[0])
  23. df = pd.concat([df, df2], ignore_index=True)
  24. df = df.sort_values('StartTime').reset_index(drop=True)
  25. dif_st = np.diff(df['StartTime'].to_numpy())
  26. idx_st = [x+1 for x, val in enumerate(dif_st) if val >= chunk_break]
  27. start_times = df['StartTime'].to_numpy()[idx_st]
  28. start_times = np.insert(start_times, 0, df['StartTime'].to_numpy()[0])
  29. df = df.sort_values('EndTime').reset_index(drop=True)
  30. dif_end = np.diff(df['EndTime'].to_numpy())
  31. idx_end = [x for x, val in enumerate(dif_end) if val >= chunk_break]
  32. end_times = df['EndTime'].to_numpy()[idx_end]
  33. end_times = np.append(end_times, df['EndTime'].to_numpy()[-1])
  34. final = pd.DataFrame(columns=['StartTime', 'EndTime'])
  35. final['StartTime'] = start_times
  36. final['EndTime'] = end_times
  37. print(final)
  38. os.makedirs(parent_directory+'/annotation_chunks/{0}/'.format(file_name), exist_ok=True)
  39. final.to_csv(parent_directory+'/annotation_chunks/{0}/{0}.csv'.format(file_name))