Procházet zdrojové kódy

major renaming and cleaning

alecristia před 1 rokem
rodič
revize
b029652dd8

+ 0 - 34
.gitmodules

@@ -1,37 +1,3 @@
-[submodule "DATASETS/cougar"]
-	path = DATASETS/cougar
-	url = git@gin.g-node.org:/LAAC-LSCP/cougar.git
-	datalad-url = git@gin.g-node.org:/LAAC-LSCP/cougar.git
-[submodule "DATASETS/lucid"]
-	path = DATASETS/lucid
-	url = git@gin.g-node.org:/LAAC-LSCP/lucid.git
-	datalad-url = git@gin.g-node.org:/LAAC-LSCP/lucid.git
-[submodule "DATASETS/winnipeg"]
-	path = DATASETS/winnipeg
-	url = git@gin.g-node.org:/LAAC-LSCP/winnipeg.git
-	datalad-url = git@gin.g-node.org:/LAAC-LSCP/winnipeg.git
-[submodule "DATASETS/warlaumont"]
-	path = DATASETS/warlaumont
-	url = git@gin.g-node.org:/LAAC-LSCP/warlaumont.git
-	datalad-url = git@gin.g-node.org:/LAAC-LSCP/warlaumont.git
-[submodule "DATASETS/bergelson"]
-	path = DATASETS/bergelson
-	url = git@gin.g-node.org:/LAAC-LSCP/bergelson.git
-	datalad-url = git@gin.g-node.org:/LAAC-LSCP/bergelson.git
-[submodule "DATASETS/fausey-trio"]
-	path = DATASETS/fausey-trio
-	url = git@gin.g-node.org:/LAAC-LSCP/fausey-trio.git
-	datalad-url = git@gin.g-node.org:/LAAC-LSCP/fausey-trio.git
-[submodule "DATASETS/laac-metrics"]
-	path = DATASETS/laac-metrics
-	url = git@gin.g-node.org:/LAAC-LSCP/laac-metrics.git
-	datalad-id = c409d94b-f4a8-4f7d-a543-9451a78ff5d5
-	datalad-url = git@gin.g-node.org:/LAAC-LSCP/laac-metrics.git
-[submodule "DATASETS/el1000-metrics"]
-	path = DATASETS/el1000-metrics
-	url = git@gin.g-node.org:/LAAC-LSCP/el1000-metrics.git
-	datalad-id = d0c2f00a-7498-4d93-b588-8d96204767a8
-	datalad-url = git@gin.g-node.org:/LAAC-LSCP/el1000-metrics.git
 [submodule "input/laac-metrics"]
 	path = input/laac-metrics
 	url = git@gin.g-node.org:/LAAC-LSCP/laac-metrics.git

+ 63 - 30
CODE/all-analyses.Rmd

@@ -166,7 +166,7 @@ df.icc.mixed = data.frame(matrix(ncol=length(df.icc.mixed.cols),nrow=0, dimnames
 
 for (data_set in data_sets){   # data_set = "aclew"
   # Load data
-  mydat <- read.csv(paste0('../DATA/', data_set,'_metrics_scaled.csv'))
+  mydat <- read.csv(paste0('../data_output/', data_set,'_metrics_scaled.csv'))
   metrics <- colnames(mydat)[!is.element(colnames(mydat), no.scale.columns)]
   for(metric in metrics) 
   {  # metric = "voc_chi_ph"
@@ -175,7 +175,7 @@ for (data_set in data_sets){   # data_set = "aclew"
   } 
 }
 
-write.csv(df.icc.mixed,"../OUTPUT/df.icc.mixed.csv",row.names=F)
+write.csv(df.icc.mixed,"../output/df.icc.mixed.csv",row.names=F)
 
 
 # repeat for the version within each corpus
@@ -194,7 +194,7 @@ df.icc.corpus = data.frame(matrix(ncol=length(df.icc.corpus.cols),nrow=0, dimnam
 
 for (data_set in data_sets){   # data_set = "aclew"
     # Load data 
-    mydat <- read.csv(paste0('../DATA/', data_set,'_metrics_scaled.csv'))
+    mydat <- read.csv(paste0('../data_output/', data_set,'_metrics_scaled.csv'))
     for(corpus in corpora){
       mycordat <- mydat[mydat$experiment == corpus, ]
       metrics <- colnames(mycordat)[!is.element(colnames(mycordat), no.scale.columns)]
@@ -207,11 +207,11 @@ for (data_set in data_sets){   # data_set = "aclew"
 }
 
 
-write.csv(df.icc.corpus,"../OUTPUT/df.icc.corpus.csv",row.names=F)
+write.csv(df.icc.corpus,"../output/df.icc.corpus.csv",row.names=F)
 ```
 
 ```{r}
-mydat <- read.csv(paste0('../DATA/', 'aclew','_metrics.csv'))
+mydat <- read.csv(paste0('../data_output/', 'aclew','_metrics.csv'))
 
 child_per_corpus = setNames(aggregate(data = mydat, child_id ~ experiment, function(child_id) length(unique(child_id))), c('experiment', 'No_Children'))
 rec_per_corpus = setNames(aggregate(data = mydat, session_id ~ experiment, function(session_id) length(unique(session_id))), c('experiment', 'No_Rec'))
@@ -231,7 +231,7 @@ corp_desc_list = list(corp_code, child_per_corpus, rec_per_corpus, dur_per_corpu
 corpus_description <- Reduce(function(x, y) merge(x, y, all=TRUE), corp_desc_list)
 corpus_description <- transform(corpus_description, Age_Range=paste(Min_Age, Max_Age, sep="-"))
 corpus_description <- subset(corpus_description, select = -c(Min_Age, Max_Age))
-write.csv(corpus_description, "../OUTPUT/corpus_description.csv", sep='\t')
+write.csv(corpus_description, "../output/corpus_description.csv", sep='\t')
 nkids=length(levels(factor(paste(mydat$experiment,mydat$child_id))))
 ```
 
@@ -259,7 +259,7 @@ df.icc.age = data.frame(matrix(ncol=length(df.icc.age.cols),nrow=0, dimnames=lis
 
 for (data_set in data_sets){   # data_set = "aclew"
   # Load data and calculate age cuts
-  mydat <- read.csv(paste0('../DATA/', data_set,'_metrics.csv')) # /!\ Do not use scaled version -> we'll scale by age later
+  mydat <- read.csv(paste0('../data_output/', data_set,'_metrics.csv')) # /!\ Do not use scaled version -> we'll scale by age later
   mydat$age_bin <- cut(mydat$age,c(0:6*6))
   
   metrics = colnames(mydat)[!is.element(colnames(mydat), no.scale.columns)]
@@ -290,7 +290,7 @@ for (data_set in data_sets){   # data_set = "aclew"
     } 
   }
 }
-write.csv(df.icc.age,"../OUTPUT/df.icc.age.csv",row.names=F)
+write.csv(df.icc.age,"../output/df.icc.age.csv",row.names=F)
 ```
 
 
@@ -298,20 +298,19 @@ write.csv(df.icc.age,"../OUTPUT/df.icc.age.csv",row.names=F)
 ## Describe datasets
 
 ```{r}
-df.icc.mixed<-read.csv("../OUTPUT/df.icc.mixed.csv")
+df.icc.mixed<-read.csv("../output/df.icc.mixed.csv")
 
 ```
 
 We are looking here at `r length(corpora)` corpora, `r nkids` children, `r max(df.icc.mixed$nobs[df.icc.mixed$data_set=="lena"])` recordings, and `r length(levels(factor(df.icc.mixed$metric)))` metrics.
 <!-- The number of children comes from nkids, in the first chunk that is not evaluated. -->
 
-Note that for LENA 
 
 
 ## Reliability analyses combining all corpora
 
 ```{r icc-allexp, echo=F,fig.width=4, fig.height=3,fig.cap="Distribution of ICC attributed to corpus (a) and children (b), when combining data from all corpora."}
-df.icc.mixed<-read.csv("../OUTPUT/df.icc.mixed.csv")
+df.icc.mixed<-read.csv("../output/df.icc.mixed.csv")
 
 icc_exp <- ggplot(df.icc.mixed, aes(x = icc_corpus, fill = toupper(data_set))) + geom_density(alpha = 0.5) + theme(legend.position = "top", axis.title.y=element_blank() ) +labs( x = "ICC corpus")+
   geom_jitter( aes(x = icc_corpus,y=0,colour=toupper(data_set)))+ scale_fill_colorblind() +scale_colour_manual(values=cbPalette)  +  theme(text = element_text(size = 20))  + ylim(-0.5,11.25) + xlim(0,1) + labs(fill='Pipeline', color="Pipeline") 
@@ -357,15 +356,15 @@ kable(x,row.names = F,digits=2,caption="Most commonly used metrics.")
 
 ```
 
-```{r icc-examples, echo=F,fig.width=4, fig.height=3,fig.cap="(A) scatterplot for one variable with relatively low ICCs versus (B) one with relatively higher ICCs (see Tables 1-2 for details)"}
+```{r icc-examples-fig2, echo=F,fig.width=4, fig.height=3,fig.cap="(A) scatterplot for one variable with relatively low ICCs versus (B) one with relatively higher ICCs (see Tables 1-2 for details)"}
 # figure of bad ICC: lena     avg_voc_dur_chi; good ICC: lena voc_och_ph
 data_set="lena"
-mydat <- read.csv(paste0('../DATA/', data_set,'_metrics_scaled.csv'))
+mydat <- read.csv(paste0('../data_output/', data_set,'_metrics.csv'))
+mydat <- read.csv(paste0('../data_output/', data_set,'_metrics_scaled.csv'))
 mydat <- mydat[is.element(mydat$experiment, corpora),]
 
 
-#remove outliers
-#for(metric in c("avg_voc_dur_chi","voc_och_ph")) mydat[, metric] <- ifelse(scale(mydat[, metric])>2.5 | scale(mydat[, metric]) < -2.5,NA,mydat[, metric])
+
 
 # remove those data points altogether
 mydat <- mydat[!is.na(mydat$avg_voc_dur_chi) & !is.na(mydat$voc_och_ph),]
@@ -388,16 +387,50 @@ for(thischild in levels(as.factor(mydat$child_id))){
     )
   } 
 }
-colnames(mysample)<-c("child_id","corpus","age","unique1","avg_voc_dur_chi1","voc_och_ph1","unique2","avg_voc_dur_chi2","voc_och_ph2")
-mysample$corpus=factor(mysample$corpus)
-bad <- ggplot(mysample, aes(avg_voc_dur_chi1,avg_voc_dur_chi2)) + geom_point(aes(colour = factor(corpus))) + 
-  geom_smooth(method='lm', formula= y~x)  + labs(color = "Corpus")  +  theme(text = element_text(size = 20)) + xlim(-2,2) + ylim(-2,2)
-good <- ggplot(mysample, aes(voc_och_ph1,voc_och_ph2)) + geom_point(aes(colour = factor(corpus))) + 
-  geom_smooth(method='lm', formula= y~x)  + labs(color = "Corpus")  +  theme(text = element_text(size = 20)) + xlim(-2,2) + ylim(-2,2)
+colnames(mysample) <-
+  c(
+    "child_id",
+    "corpus",
+    "age",
+    "unique1",
+    "avg_voc_dur_chi1",
+    "voc_och_ph1",
+    "unique2",
+    "avg_voc_dur_chi2",
+    "voc_och_ph2"
+  )
+
+mysample$corpus = factor(mysample$corpus)
+
+mylimits=range(mysample[,c("avg_voc_dur_chi1","avg_voc_dur_chi2")])
+
+bad <-
+  ggplot(mysample, aes(avg_voc_dur_chi1, avg_voc_dur_chi2)) + 
+  geom_point(aes(colour = factor(corpus))) +
+  geom_smooth(method = 'lm', formula = y ~ x)  +
+  labs(color = "Corpus")  +  
+  theme(text = element_text(size = 20)) + 
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
+panel.background = element_blank(), axis.line = element_line(colour = "black")) +
+  scale_x_continuous(name="Avg chi voc dur rec 1",limits=mylimits) +
+  scale_y_continuous(name="Avg chi voc dur rec 2",limits=mylimits) +
+  geom_abline(intercept = 0, slope = 1)
+
+mylimits=range(mysample[,c("voc_och_ph1","voc_och_ph2")])
+good <-
+  ggplot(mysample, aes(voc_och_ph1, voc_och_ph2)) + 
+  geom_point(aes(colour = factor(corpus))) +
+  geom_smooth(method = 'lm', formula = y ~ x)  + 
+  labs(color = "Corpus")  +  
+  theme(text = element_text(size = 20)) +
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
+panel.background = element_blank(), axis.line = element_line(colour = "black")) +
+  scale_x_continuous(name="N other ch voc rec 1",limits=mylimits) +
+  scale_y_continuous(name="N other ch voc rec 2",limits=mylimits) +
+  geom_abline(intercept = 0, slope = 1)
 
 
 ggarrange(bad, good,
-          labels = c("A", "B"),
           ncol = 2, nrow = 1, common.legend = TRUE, vjust = 1.5, hjust=0,
           font.label = list(size = 20))  + labs(color= "Corpus")  +  theme(text = element_text(size = 20))
 
@@ -406,7 +439,7 @@ ggarrange(bad, good,
 
 ```{r reg model}
 
-read.csv("../OUTPUT/df.icc.mixed.csv")->df.icc.mixed
+read.csv("../output/df.icc.mixed.csv")->df.icc.mixed
 
 df.icc.mixed$subject[grepl("chi", df.icc.mixed$metric, fixed = TRUE)] <- "chi"
 df.icc.mixed$subject[df.icc.mixed$metric %in% c("lp_dur","lp_n","lena_CVC","cp_dur","cp_n")] <- "chi"
@@ -441,7 +474,7 @@ summary(lr_icc_chi_common)
 nsamples=10
 
 data_set="lena"
-mydat <- read.csv(paste0('../DATA/', data_set,'_metrics_scaled.csv'))
+mydat <- read.csv(paste0('../data_output/', data_set,'_metrics_scaled.csv'))
 mydat$uchild_id <- paste(mydat$experiment, mydat$child_id)
 mydat$usession_id <- paste(mydat$uchild_id, mydat$session_id)
 mydat=mydat[order(mydat$experiment,mydat$child_id,mydat$age),]
@@ -466,7 +499,7 @@ sum(table(dist_contig$experiment[!duplicated(dist_contig$uchild_id)])) #and over
 #given those two numbers, with 5 draws we'd cover many combinations in winni, lucid, & trio; but we'll do 10 because there are a lot of recs in cougar & bergelson..
 
 data_set="aclew"
-mydat_aclew <- read.csv(paste0('../DATA/', data_set,'_metrics_scaled.csv'))
+mydat_aclew <- read.csv(paste0('../data_output/', data_set,'_metrics_scaled.csv'))
 mydat_aclew$uchild_id <- paste(mydat_aclew$experiment, mydat_aclew$child_id)
 mydat_aclew$usession_id <- paste(mydat_aclew$uchild_id, mydat_aclew$session_id)
 
@@ -517,7 +550,7 @@ print(mean_sd_rvalue)
 ## Reliability analyses per corpus
 
 ```{r icc-bycor, echo=F,fig.width=4, fig.height=10,fig.cap="Distribution of ICC attributed to children in each separate corpus."}
-df.icc.corpus<-read.csv("../OUTPUT/df.icc.corpus.csv")
+df.icc.corpus<-read.csv("../output/df.icc.corpus.csv")
 
 ggplot(df.icc.corpus, aes(x = icc_adjusted, fill = toupper(data_set))) + 
   geom_density(alpha = 0.5) + theme(legend.position = "top", axis.title.y=element_blank() ) +
@@ -532,7 +565,7 @@ ggplot(df.icc.corpus, aes(x = icc_adjusted, fill = toupper(data_set))) +
 ## Reliability by child age
 
 ```{r relBYage, echo=F,fig.width=6, fig.height=10,fig.cap="Distribution of ICC attributed to corpus (a) and children (b), when binning children's age."}
-df.icc.age<-read.csv("../OUTPUT/df.icc.age.csv")
+df.icc.age<-read.csv("../output/df.icc.age.csv")
 
 df.icc.age$age_bin<-factor(df.icc.age$age_bin,levels=c("(0,6]" , "(6,12]",  "(12,18]" ,"(18,24]" ,"(24,30]", "(30,36]", "(36,42]", "(42,48]", "(48,54]" ))
 
@@ -622,9 +655,9 @@ ggarrange(allcor, bycor,
 
 
 ```{r}
-df.icc.mixed<-read.csv("../OUTPUT/df.icc.mixed.csv")
-df.icc.age<-read.csv("../OUTPUT/df.icc.age.csv")
-df.icc.corpus<-read.csv("../OUTPUT/df.icc.corpus.csv")
+df.icc.mixed<-read.csv("../output/df.icc.mixed.csv")
+df.icc.age<-read.csv("../output/df.icc.age.csv")
+df.icc.corpus<-read.csv("../output/df.icc.corpus.csv")
 ```
 
 ```{r}

+ 21 - 19
CODE/regenerate_data.R

@@ -1,20 +1,20 @@
 # This code cannot be reproduced without access to the underlying datasets
 data_sets = c('aclew', 'lena')
-corpora <- c('bergelson', 'lucid', 'winnipeg', 'warlaumont','cougar','fausey-trio') 
+corpora <- c('bergelson', 'lucid', 'winnipeg', 'warlaumont','cougar','fausey-trio','lyon','quechua') 
 no.scale.columns <- c('experiment', 'session_id', 'child_id','child_id_unique','age_s',
                       'date_iso', 'child_dob', 'missing_audio',"age_bin","duration","usession_id",
                       "normative","age","duration_alice", "duration_vcm" ,  "duration_vtc","duration_its" )
 
 
 for (data_set in data_sets){ #data_set="aclew"
-  mydat <- read.csv(paste0('../DATASETS/el1000-metrics/output/', data_set,'_metrics.csv'))
+  mydat <- read.csv(paste0('../input/el1000-metrics/output/', data_set,'_metrics.csv'))
   mydat <- mydat[is.element(mydat$experiment, corpora),]
   
-  mydat2 <- read.csv(paste0('../DATASETS/laac-metrics/output/', data_set,'_metrics.csv'))
-  mydat2 <- mydat2[mydat2$experiment== "fausey-trio",]
+  mydat2 <- read.csv(paste0('../input/laac-metrics/output/', data_set,'_metrics.csv'))
+  mydat2 <- mydat2[mydat2$experiment %in% corpora,]
   
   # Remove FauseyElse
-  fausey_trio <- read.csv('../DATASETS/fausey-trio/metadata/recordings.csv')
+  fausey_trio <- read.csv('../input/laac-metrics/datasets/fausey-trio/metadata/recordings.csv')
   fausey_trio_full <- fausey_trio[fausey_trio$Trio_Subset == "Trio_Full", ]
   fausey_trio_full$session_id <- paste0(fausey_trio_full$HomeBank_ID, "/", fausey_trio_full$fileName)
   mydat2 <- mydat2[mydat2$session_id %in% fausey_trio_full$session_id,]
@@ -23,37 +23,39 @@ for (data_set in data_sets){ #data_set="aclew"
   mydat=rbind(mydat,mydat2)
   
   # Remove Cougar non-normatives
-  cougar <- read.csv('../DATASETS/el1000-metrics/EL1000/cougar/metadata/children.csv')
+  cougar <- read.csv('../input/el1000-metrics/EL1000/cougar/metadata/children.csv')
   cougar_normative <- cougar[cougar$normative == "Y", ]
   mydat <- mydat[mydat$experiment != 'cougar' | mydat$child_id %in% cougar_normative$child_id, ]
   
   # Save data
-  write.csv(mydat,paste0('../DATA/', data_set,'_base_data_set.csv'),row.names = F)
-  print(paste0('Save to ', paste0('../DATA/', data_set,'_base_data_set.csv')))
+  write.csv(mydat,paste0('../data_output/', data_set,'_base_data_set.csv'),row.names = F)
+  print(paste0('Save to ', paste0('../data_output/', data_set,'_base_data_set.csv')))
 }
 
 
 for (data_set in data_sets){
-  mydat <- read.csv(paste0('../DATA/', data_set,'_base_data_set.csv'))
+  mydat <- read.csv(paste0('../data_output/', data_set,'_base_data_set.csv'))
   mydat$age_s=scale(mydat$age)
   mydat$age_s=(mydat$age - mean(mydat$age , na.rm=T))/sd(mydat$age , na.rm=T)
+
+  
+  #remove outliers
+  for(metric in metrics) mydat[abs((mydat[,metric]-mean(mydat[,metric], na.rm=T))/sd(mydat[,metric], na.rm=T)) > 2.5 , metric]<-NA 
+  # values more than 2.5 SD from the metric's mean are set to NA
   
-  write.csv(mydat,paste0('../DATA/', data_set,'_metrics.csv'),row.names = F) #all variables are unscaled, except for age
-  print(paste0('Save to ', paste0('../DATA/', data_set,'_metrics.csv')))
+    
+  write.csv(mydat,paste0('../data_output/', data_set,'_metrics.csv'),row.names = F) #all variables are unscaled, except for age
+  print(paste0('Save to ', paste0('../data_output/', data_set,'_metrics.csv')))
   
   metrics = colnames(mydat)[!is.element(colnames(mydat), no.scale.columns)]
   for(metric in metrics){ #metric="pc_mal_ph"
-    # Scale mydat
-    pre_scaled_metric <- (mydat[, metric] - mean(mydat[, metric], na.rm=T))/sd(mydat[, metric], na.rm=T)
-    
-    #remove outliers
-    mydat[abs(pre_scaled_metric)>2.5 & !is.na(pre_scaled_metric), metric]<-NA #NA values that are beyond 2.5 SD from mean
+   
     
-    # Rescale with outliers removed
+    # Scale mydat
     mydat[, metric] <- (mydat[, metric] - mean(mydat[, metric], na.rm=T))/sd(mydat[, metric], na.rm=T)
   }
   
   # Save data
-  write.csv(mydat,paste0('../DATA/', data_set,'_metrics_scaled.csv'),row.names = F)
-  print(paste0('Save to ', paste0('../DATA/', data_set,'_metrics_scaled.csv')))
+  write.csv(mydat,paste0('../data_output/', data_set,'_metrics_scaled.csv'),row.names = F)
+  print(paste0('Save to ', paste0('../data_output/', data_set,'_metrics_scaled.csv')))
 }

+ 0 - 1
DATASETS/bergelson

@@ -1 +0,0 @@
-Subproject commit 2f0c91839f4832b2456474f3f5d456b5c7e8394c

+ 0 - 1
DATASETS/cougar

@@ -1 +0,0 @@
-Subproject commit 0450082f812cbe0f3f654a29c21d9b5b62feea81

+ 0 - 1
DATASETS/el1000-metrics

@@ -1 +0,0 @@
-Subproject commit a15d11d97285d5055aae31f3455e2c83c65876bd

+ 0 - 1
DATASETS/fausey-trio

@@ -1 +0,0 @@
-Subproject commit 769f0f92ef82b6da340fe6c2e60c17096e1be592

+ 0 - 1
DATASETS/laac-metrics

@@ -1 +0,0 @@
-Subproject commit 2012f139b768636519dafae214d60bdfef757526

+ 0 - 1
DATASETS/lucid

@@ -1 +0,0 @@
-Subproject commit a293bebb72180e5a3dbce3dcc263dc8ffac85cad

+ 0 - 1
DATASETS/warlaumont

@@ -1 +0,0 @@
-Subproject commit e7945b24844ec352587728ad517b609c2d028806

+ 0 - 1
DATASETS/winnipeg

@@ -1 +0,0 @@
-Subproject commit 6f15a2f4b01bbf3c0026891d838374640c8d635d

DATA/aclew_base_data_set.csv → data_output/aclew_base_data_set.csv


DATA/aclew_metrics.csv → data_output/aclew_metrics.csv


DATA/aclew_metrics_scaled.csv → data_output/aclew_metrics_scaled.csv


DATA/children.csv → data_output/children.csv


DATA/lena_base_data_set.csv → data_output/lena_base_data_set.csv


DATA/lena_metrics.csv → data_output/lena_metrics.csv


DATA/lena_metrics_scaled.csv → data_output/lena_metrics_scaled.csv