|
@@ -166,7 +166,7 @@ df.icc.mixed = data.frame(matrix(ncol=length(df.icc.mixed.cols),nrow=0, dimnames
|
|
|
|
|
|
for (data_set in data_sets){ # data_set = "aclew"
|
|
|
# Load data
|
|
|
- mydat <- read.csv(paste0('../DATA/', data_set,'_metrics_scaled.csv'))
|
|
|
+ mydat <- read.csv(paste0('../data_output/', data_set,'_metrics_scaled.csv'))
|
|
|
metrics <- colnames(mydat)[!is.element(colnames(mydat), no.scale.columns)]
|
|
|
for(metric in metrics)
|
|
|
{ # metric = "voc_chi_ph"
|
|
@@ -175,7 +175,7 @@ for (data_set in data_sets){ # data_set = "aclew"
|
|
|
}
|
|
|
}
|
|
|
|
|
|
-write.csv(df.icc.mixed,"../OUTPUT/df.icc.mixed.csv",row.names=F)
|
|
|
+write.csv(df.icc.mixed, "../output/df.icc.mixed.csv", row.names = FALSE) # FALSE, not F: F is an ordinary variable that can be reassigned
|
|
|
|
|
|
|
|
|
# repeat for the version within each corpus
|
|
@@ -194,7 +194,7 @@ df.icc.corpus = data.frame(matrix(ncol=length(df.icc.corpus.cols),nrow=0, dimnam
|
|
|
|
|
|
for (data_set in data_sets){ # data_set = "aclew"
|
|
|
# Load data
|
|
|
- mydat <- read.csv(paste0('../DATA/', data_set,'_metrics_scaled.csv'))
|
|
|
+ mydat <- read.csv(paste0('../data_output/', data_set,'_metrics_scaled.csv'))
|
|
|
for(corpus in corpora){
|
|
|
mycordat <- mydat[mydat$experiment == corpus, ]
|
|
|
metrics <- colnames(mycordat)[!is.element(colnames(mycordat), no.scale.columns)]
|
|
@@ -207,11 +207,11 @@ for (data_set in data_sets){ # data_set = "aclew"
|
|
|
}
|
|
|
|
|
|
|
|
|
-write.csv(df.icc.corpus,"../OUTPUT/df.icc.corpus.csv",row.names=F)
|
|
|
+write.csv(df.icc.corpus, "../output/df.icc.corpus.csv", row.names = FALSE) # FALSE, not F: F is an ordinary variable that can be reassigned
|
|
|
```
|
|
|
|
|
|
```{r}
|
|
|
-mydat <- read.csv(paste0('../DATA/', 'aclew','_metrics.csv'))
|
|
|
+mydat <- read.csv(paste0('../data_output/', 'aclew','_metrics.csv'))
|
|
|
|
|
|
child_per_corpus = setNames(aggregate(data = mydat, child_id ~ experiment, function(child_id) length(unique(child_id))), c('experiment', 'No_Children'))
|
|
|
rec_per_corpus = setNames(aggregate(data = mydat, session_id ~ experiment, function(session_id) length(unique(session_id))), c('experiment', 'No_Rec'))
|
|
@@ -231,7 +231,7 @@ corp_desc_list = list(corp_code, child_per_corpus, rec_per_corpus, dur_per_corpu
|
|
|
corpus_description <- Reduce(function(x, y) merge(x, y, all=TRUE), corp_desc_list)
|
|
|
corpus_description <- transform(corpus_description, Age_Range=paste(Min_Age, Max_Age, sep="-"))
|
|
|
corpus_description <- subset(corpus_description, select = -c(Min_Age, Max_Age))
|
|
|
-write.csv(corpus_description, "../OUTPUT/corpus_description.csv", sep='\t')
|
|
|
+write.csv(corpus_description, "../output/corpus_description.csv") # write.csv() always writes comma-separated output; a user-supplied `sep` is ignored with a warning, so it is not passed
|
|
|
nkids=length(levels(factor(paste(mydat$experiment,mydat$child_id))))
|
|
|
```
|
|
|
|
|
@@ -259,7 +259,7 @@ df.icc.age = data.frame(matrix(ncol=length(df.icc.age.cols),nrow=0, dimnames=lis
|
|
|
|
|
|
for (data_set in data_sets){ # data_set = "aclew"
|
|
|
# Load data and calculate age cuts
|
|
|
- mydat <- read.csv(paste0('../DATA/', data_set,'_metrics.csv')) # /!\ Do not use scaled version -> we'll scale by age later
|
|
|
+ mydat <- read.csv(paste0('../data_output/', data_set,'_metrics.csv')) # /!\ Do not use scaled version -> we'll scale by age later
|
|
|
mydat$age_bin <- cut(mydat$age,c(0:6*6))
|
|
|
|
|
|
metrics = colnames(mydat)[!is.element(colnames(mydat), no.scale.columns)]
|
|
@@ -290,7 +290,7 @@ for (data_set in data_sets){ # data_set = "aclew"
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
-write.csv(df.icc.age,"../OUTPUT/df.icc.age.csv",row.names=F)
|
|
|
+write.csv(df.icc.age, "../output/df.icc.age.csv", row.names = FALSE) # FALSE, not F: F is an ordinary variable that can be reassigned
|
|
|
```
|
|
|
|
|
|
|
|
@@ -298,20 +298,19 @@ write.csv(df.icc.age,"../OUTPUT/df.icc.age.csv",row.names=F)
|
|
|
## Describe datasets
|
|
|
|
|
|
```{r}
|
|
|
-df.icc.mixed<-read.csv("../OUTPUT/df.icc.mixed.csv")
|
|
|
+df.icc.mixed<-read.csv("../output/df.icc.mixed.csv")
|
|
|
|
|
|
```
|
|
|
|
|
|
We are looking here at `r length(corpora)` corpora, `r nkids` children, `r max(df.icc.mixed$nobs[df.icc.mixed$data_set=="lena"])` recordings, and `r length(levels(factor(df.icc.mixed$metric)))` metrics.
|
|
|
<!-- The number of children comes from nkids, in the first chunk that is not evaluated. -->
|
|
|
|
|
|
-Note that for LENA
|
|
|
|
|
|
|
|
|
## Reliability analyses combining all corpora
|
|
|
|
|
|
```{r icc-allexp, echo=F,fig.width=4, fig.height=3,fig.cap="Distribution of ICC attributed to corpus (a) and children (b), when combining data from all corpora."}
|
|
|
-df.icc.mixed<-read.csv("../OUTPUT/df.icc.mixed.csv")
|
|
|
+df.icc.mixed<-read.csv("../output/df.icc.mixed.csv")
|
|
|
|
|
|
icc_exp <- ggplot(df.icc.mixed, aes(x = icc_corpus, fill = toupper(data_set))) + geom_density(alpha = 0.5) + theme(legend.position = "top", axis.title.y=element_blank() ) +labs( x = "ICC corpus")+
|
|
|
geom_jitter( aes(x = icc_corpus,y=0,colour=toupper(data_set)))+ scale_fill_colorblind() +scale_colour_manual(values=cbPalette) + theme(text = element_text(size = 20)) + ylim(-0.5,11.25) + xlim(0,1) + labs(fill='Pipeline', color="Pipeline")
|
|
@@ -357,15 +356,15 @@ kable(x,row.names = F,digits=2,caption="Most commonly used metrics.")
|
|
|
|
|
|
```
|
|
|
|
|
|
-```{r icc-examples, echo=F,fig.width=4, fig.height=3,fig.cap="(A) scatterplot for one variable with relatively low ICCs versus (B) one with relatively higher ICCs (see Tables 1-2 for details)"}
|
|
|
+```{r icc-examples-fig2, echo=F,fig.width=4, fig.height=3,fig.cap="(A) scatterplot for one variable with relatively low ICCs versus (B) one with relatively higher ICCs (see Tables 1-2 for details)"}
|
|
|
# figure of bad ICC: lena avg_voc_dur_chi; good ICC: lena voc_och_ph
|
|
|
data_set="lena"
|
|
|
-mydat <- read.csv(paste0('../DATA/', data_set,'_metrics_scaled.csv'))
|
|
|
+# Load the scaled metrics: this is the only version of mydat used downstream in this chunk.
|
|
|
+mydat <- read.csv(paste0('../data_output/', data_set,'_metrics_scaled.csv'))
|
|
|
mydat <- mydat[is.element(mydat$experiment, corpora),]
|
|
|
|
|
|
|
|
|
-#remove outliers
|
|
|
-#for(metric in c("avg_voc_dur_chi","voc_och_ph")) mydat[, metric] <- ifelse(scale(mydat[, metric])>2.5 | scale(mydat[, metric]) < -2.5,NA,mydat[, metric])
|
|
|
+
|
|
|
|
|
|
# remove those data points altogether
|
|
|
mydat <- mydat[!is.na(mydat$avg_voc_dur_chi) & !is.na(mydat$voc_och_ph),]
|
|
@@ -388,16 +387,50 @@ for(thischild in levels(as.factor(mydat$child_id))){
|
|
|
)
|
|
|
}
|
|
|
}
|
|
|
-colnames(mysample)<-c("child_id","corpus","age","unique1","avg_voc_dur_chi1","voc_och_ph1","unique2","avg_voc_dur_chi2","voc_och_ph2")
|
|
|
-mysample$corpus=factor(mysample$corpus)
|
|
|
-bad <- ggplot(mysample, aes(avg_voc_dur_chi1,avg_voc_dur_chi2)) + geom_point(aes(colour = factor(corpus))) +
|
|
|
- geom_smooth(method='lm', formula= y~x) + labs(color = "Corpus") + theme(text = element_text(size = 20)) + xlim(-2,2) + ylim(-2,2)
|
|
|
-good <- ggplot(mysample, aes(voc_och_ph1,voc_och_ph2)) + geom_point(aes(colour = factor(corpus))) +
|
|
|
- geom_smooth(method='lm', formula= y~x) + labs(color = "Corpus") + theme(text = element_text(size = 20)) + xlim(-2,2) + ylim(-2,2)
|
|
|
+colnames(mysample) <-
|
|
|
+ c(
|
|
|
+ "child_id",
|
|
|
+ "corpus",
|
|
|
+ "age",
|
|
|
+ "unique1",
|
|
|
+ "avg_voc_dur_chi1",
|
|
|
+ "voc_och_ph1",
|
|
|
+ "unique2",
|
|
|
+ "avg_voc_dur_chi2",
|
|
|
+ "voc_och_ph2"
|
|
|
+ )
|
|
|
+
|
|
|
+mysample$corpus = factor(mysample$corpus)
|
|
|
+
|
|
|
+mylimits=range(mysample[,c("avg_voc_dur_chi1","avg_voc_dur_chi2")])
|
|
|
+
|
|
|
+bad <-
|
|
|
+ ggplot(mysample, aes(avg_voc_dur_chi1, avg_voc_dur_chi2)) +
|
|
|
+ geom_point(aes(colour = factor(corpus))) +
|
|
|
+ geom_smooth(method = 'lm', formula = y ~ x) +
|
|
|
+ labs(color = "Corpus") +
|
|
|
+ theme(text = element_text(size = 20)) +
|
|
|
+ theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
|
|
|
+panel.background = element_blank(), axis.line = element_line(colour = "black")) +
|
|
|
+ scale_x_continuous(name="Avg chi voc dur rec 1",limits=mylimits) +
|
|
|
+ scale_y_continuous(name="Avg chi voc dur rec 2",limits=mylimits) +
|
|
|
+ geom_abline(intercept = 0, slope = 1)
|
|
|
+
|
|
|
+mylimits=range(mysample[,c("voc_och_ph1","voc_och_ph2")])
|
|
|
+good <-
|
|
|
+ ggplot(mysample, aes(voc_och_ph1, voc_och_ph2)) +
|
|
|
+ geom_point(aes(colour = factor(corpus))) +
|
|
|
+ geom_smooth(method = 'lm', formula = y ~ x) +
|
|
|
+ labs(color = "Corpus") +
|
|
|
+ theme(text = element_text(size = 20)) +
|
|
|
+ theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
|
|
|
+panel.background = element_blank(), axis.line = element_line(colour = "black")) +
|
|
|
+ scale_x_continuous(name="N other ch voc rec 1",limits=mylimits) +
|
|
|
+ scale_y_continuous(name="N other ch voc rec 2",limits=mylimits) +
|
|
|
+ geom_abline(intercept = 0, slope = 1)
|
|
|
|
|
|
|
|
|
ggarrange(bad, good,
|
|
|
- labels = c("A", "B"),
|
|
|
ncol = 2, nrow = 1, common.legend = TRUE, vjust = 1.5, hjust=0,
|
|
|
font.label = list(size = 20)) + labs(color= "Corpus") + theme(text = element_text(size = 20))
|
|
|
|
|
@@ -406,7 +439,7 @@ ggarrange(bad, good,
|
|
|
|
|
|
```{r reg model}
|
|
|
|
|
|
-read.csv("../OUTPUT/df.icc.mixed.csv")->df.icc.mixed
|
|
|
+df.icc.mixed <- read.csv("../output/df.icc.mixed.csv") # left-assignment, matching the other reads of this file in the document
|
|
|
|
|
|
df.icc.mixed$subject[grepl("chi", df.icc.mixed$metric, fixed = TRUE)] <- "chi"
|
|
|
df.icc.mixed$subject[df.icc.mixed$metric %in% c("lp_dur","lp_n","lena_CVC","cp_dur","cp_n")] <- "chi"
|
|
@@ -441,7 +474,7 @@ summary(lr_icc_chi_common)
|
|
|
nsamples=10
|
|
|
|
|
|
data_set="lena"
|
|
|
-mydat <- read.csv(paste0('../DATA/', data_set,'_metrics_scaled.csv'))
|
|
|
+mydat <- read.csv(paste0('../data_output/', data_set,'_metrics_scaled.csv'))
|
|
|
mydat$uchild_id <- paste(mydat$experiment, mydat$child_id)
|
|
|
mydat$usession_id <- paste(mydat$uchild_id, mydat$session_id)
|
|
|
mydat=mydat[order(mydat$experiment,mydat$child_id,mydat$age),]
|
|
@@ -466,7 +499,7 @@ sum(table(dist_contig$experiment[!duplicated(dist_contig$uchild_id)])) #and over
|
|
|
#given those two numbers, with 5 draws we'd cover many combinations in winni, lucid, & trio; but we'll do 10 because there are a lot of recs in cougar & bergelson..
|
|
|
|
|
|
data_set="aclew"
|
|
|
-mydat_aclew <- read.csv(paste0('../DATA/', data_set,'_metrics_scaled.csv'))
|
|
|
+mydat_aclew <- read.csv(paste0('../data_output/', data_set,'_metrics_scaled.csv'))
|
|
|
mydat_aclew$uchild_id <- paste(mydat_aclew$experiment, mydat_aclew$child_id)
|
|
|
mydat_aclew$usession_id <- paste(mydat_aclew$uchild_id, mydat_aclew$session_id)
|
|
|
|
|
@@ -517,7 +550,7 @@ print(mean_sd_rvalue)
|
|
|
## Reliability analyses per corpus
|
|
|
|
|
|
```{r icc-bycor, echo=F,fig.width=4, fig.height=10,fig.cap="Distribution of ICC attributed to children in each separate corpus."}
|
|
|
-df.icc.corpus<-read.csv("../OUTPUT/df.icc.corpus.csv")
|
|
|
+df.icc.corpus<-read.csv("../output/df.icc.corpus.csv")
|
|
|
|
|
|
ggplot(df.icc.corpus, aes(x = icc_adjusted, fill = toupper(data_set))) +
|
|
|
geom_density(alpha = 0.5) + theme(legend.position = "top", axis.title.y=element_blank() ) +
|
|
@@ -532,7 +565,7 @@ ggplot(df.icc.corpus, aes(x = icc_adjusted, fill = toupper(data_set))) +
|
|
|
## Reliability by child age
|
|
|
|
|
|
```{r relBYage, echo=F,fig.width=6, fig.height=10,fig.cap="Distribution of ICC attributed to corpus (a) and children (b), when binning children's age."}
|
|
|
-df.icc.age<-read.csv("../OUTPUT/df.icc.age.csv")
|
|
|
+df.icc.age<-read.csv("../output/df.icc.age.csv")
|
|
|
|
|
|
df.icc.age$age_bin<-factor(df.icc.age$age_bin,levels=c("(0,6]" , "(6,12]", "(12,18]" ,"(18,24]" ,"(24,30]", "(30,36]", "(36,42]", "(42,48]", "(48,54]" ))
|
|
|
|
|
@@ -622,9 +655,9 @@ ggarrange(allcor, bycor,
|
|
|
|
|
|
|
|
|
```{r}
|
|
|
-df.icc.mixed<-read.csv("../OUTPUT/df.icc.mixed.csv")
|
|
|
-df.icc.age<-read.csv("../OUTPUT/df.icc.age.csv")
|
|
|
-df.icc.corpus<-read.csv("../OUTPUT/df.icc.corpus.csv")
|
|
|
+df.icc.mixed<-read.csv("../output/df.icc.mixed.csv")
|
|
|
+df.icc.age<-read.csv("../output/df.icc.age.csv")
|
|
|
+df.icc.corpus<-read.csv("../output/df.icc.corpus.csv")
|
|
|
```
|
|
|
|
|
|
```{r}
|