regenerate_data.R

# This code cannot be reproduced without access to the underlying datasets
data_sets <- c('aclew', 'lena')
corpora <- c('bergelson', 'lucid', 'winnipeg', 'warlaumont', 'cougar', 'fausey-trio', 'lyon', 'quechua')
no.scale.columns <- c('experiment', 'session_id', 'child_id', 'child_id_unique', 'age_s',
                      'date_iso', 'child_dob', 'missing_audio', 'age_bin', 'duration', 'usession_id',
                      'normative', 'age', 'duration_alice', 'duration_vcm', 'duration_vtc', 'duration_its')
for (data_set in data_sets) { # data_set = "aclew"
  # Read the EL1000 and LAAC metric tables and keep only the target corpora
  mydat <- read.csv(paste0('../input/el1000-metrics/output/', data_set, '_metrics.csv'))
  mydat <- mydat[mydat$experiment %in% corpora, ]
  mydat2 <- read.csv(paste0('../input/laac-metrics/output/', data_set, '_metrics.csv'))
  mydat2 <- mydat2[mydat2$experiment %in% corpora, ]
  # Remove FauseyElse: keep only the Trio_Full subset of fausey-trio
  fausey_trio <- read.csv('../input/laac-metrics/datasets/fausey-trio/metadata/recordings.csv')
  fausey_trio_full <- fausey_trio[fausey_trio$Trio_Subset == "Trio_Full", ]
  fausey_trio_full$session_id <- paste0(fausey_trio_full$HomeBank_ID, "/", fausey_trio_full$fileName)
  mydat2 <- mydat2[mydat2$session_id %in% fausey_trio_full$session_id, ]
  # Note: since we only take fausey-trio from the LAAC set, the fact that png2019 &
  # tsimane2017 are repeated across laac & el1000 is not a problem
  mydat <- rbind(mydat, mydat2)
  # Remove Cougar non-normatives: keep rows from every other corpus, plus cougar rows
  # whose child is flagged normative ("Y") in the corpus metadata
  cougar <- read.csv('../input/el1000-metrics/EL1000/cougar/metadata/children.csv')
  cougar_normative <- cougar[cougar$normative == "Y", ]
  mydat <- mydat[mydat$experiment != 'cougar' | mydat$child_id %in% cougar_normative$child_id, ]

  # Save data
  write.csv(mydat, paste0('../data_output/', data_set, '_base_data_set.csv'), row.names = FALSE)
  print(paste0('Saved to ../data_output/', data_set, '_base_data_set.csv'))
}
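# Second pass: for each pipeline, standardize age, blank out per-metric outliers
# (|z| > 2.5), and write both an unscaled and a z-scored version of the metrics.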
for (data_set in data_sets) {
  mydat <- read.csv(paste0('../data_output/', data_set, '_base_data_set.csv'))
  # Standardize age (z-score); all other columns stay on their original scale for now
  mydat$age_s <- (mydat$age - mean(mydat$age, na.rm = TRUE)) / sd(mydat$age, na.rm = TRUE)
  # Metrics are every column not listed in no.scale.columns
  metrics <- colnames(mydat)[!colnames(mydat) %in% no.scale.columns]
  # Remove outliers: NA out values beyond 2.5 SD from that metric's mean
  for (metric in metrics) {
    mydat[abs((mydat[, metric] - mean(mydat[, metric], na.rm = TRUE)) / sd(mydat[, metric], na.rm = TRUE)) > 2.5, metric] <- NA
  }
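  # For reference: under a normal distribution, |z| > 2.5 flags roughly 1.2% of values,
  # so only the extreme tails of each metric are dropped.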
  # Save the unscaled version (all metrics on their original scale; age_s is standardized)
  write.csv(mydat, paste0('../data_output/', data_set, '_metrics.csv'), row.names = FALSE)
  print(paste0('Saved to ../data_output/', data_set, '_metrics.csv'))
  # Scale each remaining metric (z-score) after outlier removal
  for (metric in metrics) { # metric = "pc_mal_ph"
    mydat[, metric] <- (mydat[, metric] - mean(mydat[, metric], na.rm = TRUE)) / sd(mydat[, metric], na.rm = TRUE)
  }

  # Save the scaled version
  write.csv(mydat, paste0('../data_output/', data_set, '_metrics_scaled.csv'), row.names = FALSE)
  print(paste0('Saved to ../data_output/', data_set, '_metrics_scaled.csv'))
}
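# A minimal usage sketch (not part of the pipeline): assuming the script has run,
# a scaled table can be read back in for downstream analysis, e.g. for ACLEW:
# aclew_scaled <- read.csv('../data_output/aclew_metrics_scaled.csv')
# summary(aclew_scaled$age_s)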