01-generate_stimuli.R 26 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632
  1. library(readr)
  2. library(dplyr)
  3. library(tidyr)
  4. library(purrr)
  5. library(forcats)
  6. library(ggplot2)
  7. library(stringi)
  8. library(LexOPS)
  9. library(ggExtra)
  10. library(gghalves)
  11. library(patchwork)
  12. library(overlapping)
  13. library(parallel)
  14. library(stringdist)
  15. n_cores <- 12
  16. # import stimuli data -----------------------------------------------------
  17. boss <- read_csv("boss.csv") %>%
  18. # lowercase words
  19. mutate(modal_name = stri_trans_tolower(modal_name)) %>%
  20. # percentages as numbers
  21. mutate_at(vars(dplyr::matches("perc")), function(x) as.numeric(sub("%", "", x))) %>%
  22. # no spaces
  23. filter(!grepl(" ", modal_name)) %>%
  24. # join to LexOPS
  25. full_join(lexops, by = c("modal_name" = "string")) %>%
  26. # make note of whether a picture's modal name
  27. mutate(is_pic = ifelse(is.na(filename), "no", "yes"))
  28. swow <- read_csv("swow_ppmi.csv")
  29. # useful functions for selecting stimuli ----------------------------------
  30. # function to calculate cosine similarity
  31. calc_cos <- function(a, b) {
  32. if (length(a) != length(b)) stop("Inconsistent vector lengths")
  33. sum(a * b) / sqrt(sum(a^2)*sum(b^2))
  34. }
  35. # function to calculate associative strength for SWOW
  36. assoc_str <- function(cues_a, cues_b, .swow = swow, print = FALSE, sim_measure = "R123.Strength") {
  37. if (length(cues_a) != length(cues_b)) stop("Inconsistent vector lengths")
  38. print_perc_vals <- seq(0, 100, by = 0.1)
  39. print_iters <- if (print) {
  40. lapply(1:length(cues_a), function(i) {
  41. perc_done <- (i / length(cues_a)) * 100
  42. if (round(perc_done, 1) %in% print_perc_vals) {
  43. print_perc_vals <<- print_perc_vals[print_perc_vals != round(perc_done, 1)]
  44. i
  45. } else {
  46. NULL
  47. }
  48. })
  49. } else {
  50. c()
  51. }
  52. sapply(1:length(cues_a), function(i) {
  53. cues_a_i <- as.character(cues_a[[i]])
  54. cues_b_i <- as.character(cues_b[[i]])
  55. if (identical(cues_a_i, cues_b_i)) {
  56. cos_i <- if (cues_a_i %in% .swow$cue) 1 else NA
  57. } else if (!all(c(cues_a_i, cues_b_i) %in% .swow$cue)) {
  58. cos_i <- NA
  59. } else {
  60. cues_neighbours <- .swow %>%
  61. dplyr::filter(cue %in% c(cues_a_i, cues_b_i)) %>%
  62. dplyr::mutate(cue = dplyr::case_when(
  63. cue == cues_a_i ~ "cue_a",
  64. cue == cues_b_i ~ "cue_b"
  65. )) %>%
  66. select(cue, response, !!dplyr::sym(sim_measure)) %>%
  67. tidyr::pivot_wider(names_from = cue, values_from = !!dplyr::sym(sim_measure), names_prefix = "p_") %>%
  68. #dplyr::filter(!is.na(p_cue_a), !is.na(p_cue_b))
  69. tidyr::replace_na(list(p_cue_a = 0, p_cue_b = 0))
  70. cos_i <- calc_cos(cues_neighbours$p_cue_a, cues_neighbours$p_cue_b)
  71. }
  72. if (print & i %in% print_iters) {
  73. perc_done <- i / length(cues_a) * 100
  74. cat(sprintf("%i/%i (%.1f%%), \"%s\" ~ \"%s\" = %s\n", i, length(cues_a), round(perc_done, 1), cues_a_i, cues_b_i, cos_i))
  75. }
  76. cos_i
  77. })
  78. }
  79. # assoc_str(c("cat", "cat", "cat"), c("cat", "teacher", "jungle"))
  80. # assoc_str(c("cat", "cat", "cat"), c("cat", "teacher", "jungle"), sim_measure = "ppmi")
  81. # function to get semantic similarity for swow (direct neighbours)
  82. cos_sim <- function(matches, target) {
  83. if (all(matches==target)) return(rep(1, length(matches)))
  84. assoc_str(rep(target, length(matches)), matches, sim_measure = "ppmi")
  85. }
  86. # cos_sim(c("cat", "teacher", "jungle", "dog"), "cat")
  87. # x <- cos_sim(sample(lexops$string, 5000), "cat")
  88. # function for maximising levenshtein distance, assuming that length is matched exactly
  89. # (returns value of 1 if maximum distance possible, otherwise 0)
  90. maximise_lev_dist <- function(xsource, targets) {
  91. dists <- stringdist(a=xsource, b=targets, method="lv")
  92. as.numeric(dists==nchar(xsource))
  93. }
  94. # LexOPS pipeline ---------------------------------------------------------
  95. stim_seed <- 1
  96. # words which are much more likely to be produced by Americans/Canadians
  97. americanisms <- c("squash", "airplane", "candy", "bison", "buffalo", "sidewalk", "camper", "motel", "checkers", "store", "soccer", "trash", "burner", "boardwalk", "automobile", "cellphone", "tombstone", "mailbox", "panties", "cab", "yogurt", "pants")
  98. # words which are inappropriate and distracting
  99. inappropriate <- c("piss", "penis", "sperm", "cannabis", "poop", "breast", "marijuana", "blowjob", "boob", "porn", "pussy", "bosom", "vagina", "muff", "breasts", "ass", "pee", "rump", "vulva")
  100. # words which are unimageable, or describe things whose images would be very different to those in the normed set (e.g. 'library', when there are no buildings in the set) or are not actually nouns
  101. unimageable <- c("siding", "cemetery", "polo", "woods", "bowel", "lobby", "bladder", "library", "zoo", "volcano", "apartments", "pasture", "peroxide", "temple", "weep", "sunrise", "storm", "mosque", "indigo", "bookshop", "pharmacy", "bakery", "orphanage", "sunset", "meadow", "venom", "brewery", "campus", "bulletin", "canal", "ranch", "salon", "yoga", "canteen", "livestock", "mammal", "casino", "subway", "skies", "vaccine", "placenta", "landfill", "jump", "snapshot", "vault", "badminton", "vessel", "troll", "surface", "parasite", "asylum", "sowing", "barefoot", "swimming", "arcade", "town", "shaft", "shore", "birth", "catcher", "waltz", "stomp", "spa", "animal", "cattle", "inch", "jungle", "attire", "postage", "poultry", "station", "pregnancy", "beach", "waist", "midget", "burp", "skyline", "workplace", "judo", "primate", "yawn", "trench", "bum", "injury", "shipment", "karaoke", "workshop", "chapel", "orchard", "runway", "saloon", "womb", "laughter", "cinema", "insulin", "chlorine", "stream", "movie", "chord", "piles", "rainy", "bacteria", "ravine", "beast", "motorway", "monsoon", "terrace", "woodwind", "jail", "retina", "planet", "wetland", "morphine", "cafe", "siren", "plantation", "sunlight", "meter", "print", "mucus", "sleet", "visa", "narcotics", "steroids", "urine", "blink", "appliance", "turquoise", "voice", "racing", "uterus", "chime", "slap", "nod", "pet", "bottom", "nudge", "army", "apartment", "village", "nightclub", "morgue", "kick", "city", "cellar", "outdoors", "squint", "booty", "lab", "street", "streets", "shock", "massacre", "item", "yard", "carnival", "structure", "prairie", "nudity", "shred", "nursery", "belch", "tint", "kilometer", "kilogram", "beep", "opera", "pageant", "swatter", "booze", "wiring", "vomit", "rodeo", "surf", "buffet", "bite", "chant", "swine", "mansion", "mill", "shack", "cabin", "alley", "lump", "ballot", "supper", "material", "lobe", "steel", "venison", "beef", "butt", "liquor", "pop", "office", "museum", "lookout", "port", "vitamin", "tee", "church", "labyrinth", "utensil")
  102. # words which describe people (no images of people in the normed set of images, only parts like 'hand')
  103. people <- c("tailor", "landlord", "waitress", "lifeguard", "parent", "swimmer", "cashier", "actor", "vendor", "bride", "broker", "salesman", "sailor", "pilot", "pirate", "redhead", "tutor", "runner", "brunette", "adult", "wizard", "chaps", "lawyer", "beekeeper", "priest", "kids", "uncle", "jockey", "surfer", "tenant", "violinist", "jury", "babysitter", "milkman", "astronaut", "beggar", "infant", "child", "robber", "baby", "talker", "tradesman", "lad", "magician", "plumber", "captain", "shooter", "nun", "prostitute", "pastor", "chauffeur", "rapper", "assassin", "bachelor", "stewardess", "visitor", "chairman", "chairwoman", "buyer", "sons", "banker", "veteran", "niece", "twins", "lady", "skater", "wrestler", "actress", "birth", "slave", "bomber", "boxer", "musician", "inmate", "worker", "kid", "carpenter", "physician", "blacksmith", "poet", "tribe", "shopper", "pilgrim", "ambassador", "chemist", "citizen", "male", "analyst", "chorus", "caveman", "butler", "brigade", "publisher", "roommate", "jogging", "mistress", "writer", "accountant", "mum", "attendant", "seaman", "monarch", "audience", "umpire", "brother", "typist", "scout", "assistant", "witch", "singer", "superhero", "troops", "pupil", "waiter", "troop", "pianist", "bodyguard", "biker", "choir", "surgeon", "men", "maid", "quarterback", "model", "officer", "mermaid", "teacher", "stepmother", "lecturer", "thief", "student", "reporter", "librarian", "nurse", "stylist", "realtor", "boy", "operator", "therapist", "platoon", "mother", "ladies", "men", "wife", "husband", "wives", "husbands", "nomad", "proprietor", "steward")
  104. # shortened words which may otherwise appear twice
  105. shortened <- c("rhino", "limo", "chimp", "scuba", "bike")
  106. # alternate versions of the same word, e.g. tomb and tombstone (plurals are okay, but this prevents repeating a word, e.g. 'spice' and 'spices')
  107. alternates <- c("spices", "tomb", "nostril", "needles", "chips", "levers", "motorbike", "peanuts", "trucks", "stairway", "meteor", "stair", "snails", "boots", "kitty", "piggy", "liqueur", "weed", "tummy", "ropes", "bumblebee")
  108. # words which I think have high concreteness because they are common misspellings of concrete words
  109. misspellings <- c("canon")
  110. # modal names for images clearly incorrectly named
  111. incorrects <- c("nut", "trumpet", "tuba", "spinach")
  112. # plural words (as all images are single objects)
  113. plurals <- c("sticks", "bees", "buttons", "strings", "ceramics", "tables", "arms", "molasses", "cereals", "mice", "brushes")
  114. # incongruent matches which have been excluded as they were possible descriptions for their image
  115. plausible <- c(
  116. "rubber", # was a match for "statue", which could also be a small rubber
  117. "buck", # was a match for a picture of a book (homophone)
  118. "marrow", # was a match for pickle
  119. "logo", # just about anything could be a logo
  120. "kit" # was a match for a baseball cap
  121. )
  122. stim <- boss %>%
  123. filter(
  124. (is_pic == "yes" & dupe_pref) | is_pic=="no",
  125. modal_name %in% swow$cue,
  126. PK.Brysbaert >= 0.9,
  127. PoS.SUBTLEX_UK == "noun",
  128. CNC.Brysbaert >= 4,
  129. !modal_name %in% c(americanisms, inappropriate, unimageable, people, shortened, alternates, misspellings, incorrects, plurals, plausible)
  130. ) %>%
  131. mutate(is_pic = as.factor(is_pic)) %>%
  132. rename(string = "modal_name") %>%
  133. split_by(is_pic, "yes" ~ "no") %>%
  134. # -0.5:0 (similarity of between 0.5 and 1) seems to give high similarity
  135. # -1:-0.99 (similarity of between 0 and 0.01) gives suitably dissimilar
  136. control_for_map(cos_sim, string, -1:-0.99, name = "cos_ppmi_sim") %>%
  137. control_for_map(maximise_lev_dist, string, 1:1, name = "max_lev_dist") %>%
  138. control_for(Length, 0:0) %>%
  139. control_for(CNC.Brysbaert, -0.25:0.25) %>%
  140. control_for(ON.OLD20, -0.75:0.75) %>%
  141. control_for(Zipf.SUBTLEX_UK, -0.125:0.125) %>%
  142. control_for(BG.SUBTLEX_UK, -0.0025:0.0025) %>%
  143. generate(200, seed = stim_seed)
  144. write_csv(stim, "stim.csv")
  145. stim_lev_dists <- stim %>%
  146. rowwise() %>%
  147. mutate(levenshtein_distance = stringdist(A1, A2, method="lv")) %>%
  148. ungroup() %>%
  149. select(item_nr, levenshtein_distance)
  150. stim_tidy <- stim %>%
  151. long_format("all") %>%
  152. rename(perc_name_agree = "perc_name_agree_denom_fq_inputs") %>%
  153. pivot_wider(id_cols = item_nr, names_from = condition, values_from = c(string, filename, perc_name_agree, Zipf.SUBTLEX_UK, Length, cos_ppmi_sim, CNC.Brysbaert, BG.SUBTLEX_UK, nb_diff_names, ON.OLD20)) %>%
  154. select(item_nr, string_A1, filename_A1, perc_name_agree_A1, string_A2, everything(), -filename_A2, -perc_name_agree_A2) %>%
  155. left_join(stim_lev_dists, by="item_nr")
  156. # optimise split for counterbalancing -------------------------------------
  157. message("Optimising counterbalance split")
  158. vars_to_match <- c("perc_name_agree_A1", "Zipf.SUBTLEX_UK_A1", "Zipf.SUBTLEX_UK_A2", "cos_ppmi_sim_A1", "cos_ppmi_sim_A2", "BG.SUBTLEX_UK_A1", "BG.SUBTLEX_UK_A2", "CNC.Brysbaert_A1", "CNC.Brysbaert_A2", "ON.OLD20_A1", "ON.OLD20_A2", "Length_A1", "Length_A2")
  159. # how many seeds to try
  160. max_seed <- 50000
  161. cl <- makeCluster(n_cores)
  162. cl_packages <- clusterEvalQ(cl, {library(dplyr); library(purrr); library(overlapping)})
  163. clusterExport(cl, c("stim_tidy", "vars_to_match"))
  164. dist_overlaps <- parLapply(cl, 1:max_seed, function(seed_i) {
  165. cat(sprintf("%s \r", seed_i))
  166. set.seed(seed_i)
  167. order_grp_vec <- sample(rep(c(1, 2), 100), 200, replace = FALSE)
  168. stim_seed_i <- mutate(stim_tidy, order_grp = order_grp_vec)
  169. tibble(
  170. seed_i = seed_i,
  171. var_j = vars_to_match,
  172. ov = map_dbl(vars_to_match, function(var_j) {
  173. var_grp_1 <- stim_seed_i %>%
  174. filter(order_grp == 1) %>%
  175. pull(var_j) %>%
  176. as.numeric()
  177. var_grp_2 <- stim_seed_i %>%
  178. filter(order_grp == 2) %>%
  179. pull(var_j) %>%
  180. as.numeric()
  181. ov <- overlapping::overlap(list(var_grp_1, var_grp_2))
  182. ov$OV %>%
  183. unname() %>%
  184. as.numeric()
  185. })
  186. )
  187. }) %>%
  188. reduce(bind_rows)
  189. stopCluster(cl)
  190. dist_overlaps_summ <- dist_overlaps %>%
  191. group_by(seed_i) %>%
  192. summarise(sum = sum(ov), median = median(ov), mean = mean(ov), sd = sd(ov), min = min(ov)) %>%
  193. arrange(desc(mean), sd)
  194. # apply the split ---------------------------------------------------------
  195. dist_overlaps_summ %>%
  196. pull(seed_i) %>%
  197. first() %>%
  198. set.seed()
  199. order_grp_vec <- sample(rep(c(1, 2), 100), 200, replace = FALSE)
  200. stim_tidy <- mutate(stim_tidy, order_grp = order_grp_vec)
  201. write_csv(stim_tidy, "stim_tidy.csv")
  202. stim_tidy_long <- stim %>%
  203. long_format("all") %>%
  204. arrange(item_nr) %>%
  205. mutate(
  206. filename = ifelse(is.na(filename), lag(filename), filename),
  207. cos_ppmi_sim_tidy = as.numeric(ifelse(cos_ppmi_sim=="1", NA, cos_ppmi_sim))
  208. ) %>%
  209. rename(perc_name_agree = "perc_name_agree_denom_fq_inputs") %>%
  210. # also store the order group
  211. full_join(select(stim_tidy, item_nr, order_grp), by = "item_nr")
  212. write_csv(stim_tidy_long, "stim_tidy_long.csv")
  213. # select practice trials --------------------------------------------------
  214. message("Selecting practice trials")
  215. prac_seed <- 111
  216. set.seed(prac_seed)
  217. A1_trials <- sample(1:20, 10)
  218. practice_stim <- boss %>%
  219. filter(
  220. (is_pic == "yes" & dupe_pref) | is_pic=="no",
  221. modal_name %in% swow$cue,
  222. PK.Brysbaert >= 0.9,
  223. PoS.SUBTLEX_UK == "noun",
  224. CNC.Brysbaert >= 4,
  225. !modal_name %in% c(americanisms, inappropriate, unimageable, people, shortened, alternates, misspellings, incorrects, plurals, plausible),
  226. !modal_name %in% stim_tidy_long$string
  227. ) %>%
  228. mutate(is_pic = as.factor(is_pic)) %>%
  229. rename(string = "modal_name") %>%
  230. split_by(is_pic, "yes" ~ "no") %>%
  231. # -0.5:0 (similarity of between 0.5 and 1) seems to give high similarity
  232. # -1:-0.999 (similarity of between 0 and 0.001) gives suitably disimilar
  233. control_for_map(cos_sim, string, -1:-0.99, name = "cos_ppmi_sim") %>%
  234. control_for_map(maximise_lev_dist, string, 1:1, name = "max_lev_dist") %>%
  235. control_for(Length, 0:0) %>%
  236. generate(20, seed = prac_seed)
  237. practice_stim_long <- practice_stim %>%
  238. long_format("all") %>%
  239. mutate(filename = ifelse(is.na(filename), lag(filename), filename)) %>%
  240. # randomly allocate half the items to congruent condition, and half to incongruent, and only keep the relevant trials
  241. filter( (condition=="A1" & item_nr %in% A1_trials) | (condition=="A2" & !item_nr %in% A1_trials) )
  242. # save the practice trials
  243. write_csv(practice_stim_long, "practice_stim.csv")
  244. # tidy picture-word stimuli figures ---------------------------------------
  245. itemwise_vars_for_dens <- c("Length", "Zipf.SUBTLEX_UK", "BG.SUBTLEX_UK", "CNC.Brysbaert", "ON.OLD20")
  246. distwise_vars_for_dens <- c("perc_name_agree", "cos_ppmi_sim_tidy")
  247. # plt_cols <- c("#007EFF", "#FF007E")
  248. plt_cols <- c("#E69F00", "#56B4E9")
  249. counterbalanced_grps_label <- c("Set 1", "Set 2")
  250. fontsize <- 12
  251. # Format data
  252. plt_stim_itemwise <- stim_tidy_long %>%
  253. mutate(condition = sub("-", "\n", condition, fixed=TRUE)) %>%
  254. pivot_longer(cols=all_of(itemwise_vars_for_dens), names_to="plt_variable", values_to="plt_value") %>%
  255. mutate(condition_j = as.numeric(factor(condition))) %>%
  256. rowwise() %>%
  257. # add horizontal jitter to points and lines
  258. mutate(., condition_j = ifelse(condition=="A1", condition_j + runif(1, 0.1, 0.15), condition_j + runif(1, -0.15, -0.1))) %>%
  259. # mutate(., condition_j = ifelse(condition=="A1", condition_j + 0.15, condition_j -0.15)) %>%
  260. ungroup() %>%
  261. mutate(
  262. condition = ifelse(condition=="A1", "Picture\nCongruent", "Picture\nIncongruent"),
  263. plt_variable = factor(plt_variable, levels = itemwise_vars_for_dens),
  264. plt_variable = fct_recode(
  265. plt_variable,
  266. `Length` = "Length",
  267. `Zipf Frequency` = "Zipf.SUBTLEX_UK",
  268. `Bigram Probability` = "BG.SUBTLEX_UK",
  269. `Concreteness` = "CNC.Brysbaert",
  270. `OLD20` = "ON.OLD20"
  271. )
  272. )
  273. itemwise_plt <- plt_stim_itemwise %>%
  274. ggplot(aes(condition, plt_value, colour=factor(order_grp), fill=factor(order_grp))) +
  275. geom_half_violin(data=filter(plt_stim_itemwise, condition=="Picture\nCongruent"), alpha=0.5, trim=FALSE, side="l", position=position_identity(), show.legend=FALSE) +
  276. geom_half_violin(data=filter(plt_stim_itemwise, condition=="Picture\nIncongruent"), alpha=0.5, trim=FALSE, side="r", position=position_identity(), show.legend=FALSE) +
  277. geom_point(aes(x=condition_j), alpha=0.75, shape=16, size=0.8) +
  278. geom_line(aes(x=condition_j, group=item_nr), alpha=0.3) +
  279. scale_fill_manual(name = "Stimulus Set", labels = counterbalanced_grps_label, values = plt_cols) +
  280. scale_colour_manual(name = "Stimulus Set", labels = counterbalanced_grps_label, values = plt_cols, guide = guide_legend(override.aes = list(alpha = 1, size=2, linewidth=1))) +
  281. scale_y_continuous(breaks = scales::pretty_breaks(5)) +
  282. facet_wrap(vars(plt_variable), scales="free_y") +
  283. labs(
  284. x = "\nCongruency Condition\n",
  285. y = "Value",
  286. tag = "a"
  287. ) +
  288. theme_classic() +
  289. theme(
  290. legend.position = c(0.85, 0.15),
  291. text = element_text(size=fontsize),
  292. # plot.title = element_text(hjust=-0.1),
  293. strip.background = element_blank(),
  294. legend.margin = margin(),
  295. plot.margin = margin(3,3,3,8, unit="pt")
  296. )
  297. plt_stim_distwise <- stim_tidy_long %>%
  298. mutate(condition = sub("-", "\n", condition, fixed=TRUE)) %>%
  299. pivot_longer(cols=all_of(distwise_vars_for_dens), names_to="plt_variable", values_to="plt_value") %>%
  300. mutate(
  301. plt_variable = factor(plt_variable, levels = distwise_vars_for_dens),
  302. plt_variable = fct_recode(
  303. plt_variable,
  304. `Percentage of Name Agreement` = "perc_name_agree",
  305. `Cosine PPMI Semantic Similarity` = "cos_ppmi_sim_tidy"
  306. ),
  307. plt_variable = fct_rev(plt_variable)
  308. )
  309. custom_breaks <- function(x) { if (max(x) < 0.01) pretty(x, 3) else pretty(x, 5) }
  310. distwise_plt <- plt_stim_distwise %>%
  311. ggplot(aes(plt_value, colour=factor(order_grp), fill=factor(order_grp))) +
  312. geom_density(alpha=0.5, show.legend = FALSE) +
  313. facet_wrap(vars(plt_variable), scales="free") +
  314. labs(
  315. x = "Value",
  316. y = "Density",
  317. tag = "b"
  318. ) +
  319. scale_fill_manual(name = "Stimulus Set", labels = counterbalanced_grps_label, values = plt_cols) +
  320. scale_colour_manual(name = "Stimulus Set", labels = counterbalanced_grps_label, values = plt_cols) +
  321. scale_x_continuous(breaks = custom_breaks) +
  322. scale_y_continuous(breaks = scales::pretty_breaks(3)) +
  323. theme_classic() +
  324. theme(
  325. legend.position = "none",
  326. text = element_text(size=fontsize),
  327. # plot.title = element_text(hjust=-0.1),
  328. strip.background = element_blank(),
  329. legend.margin = margin(),
  330. plot.margin = margin(3,3,3,8, unit="pt")
  331. )
  332. stim_summ <- (itemwise_plt / distwise_plt) +
  333. plot_layout(
  334. heights = c(5.5, 1), widths = 6.5,
  335. guides = "keep"
  336. )
  337. ggsave(file.path("fig", "stimuli_summary.png"), stim_summ, height=5.5, width=6.5, device="png", type="cairo")
  338. ggsave(file.path("fig", "stimuli_summary.pdf"), stim_summ, height=5.5, width=6.5, device="pdf")
  339. # select localiser stimuli ------------------------------------------------
  340. message("Selecting localiser stimuli")
  341. # misspelt words in the prevalence norms
  342. prev_misspellings <- c("yogurt")
  343. # words which might not be technically words
  344. nonwords <- c("yippee")
  345. max_seed <- 500000
  346. set.seed(stim_seed)
  347. loc_seeds <- sample(1:2147483647, max_seed, replace=FALSE)
  348. match_vars <- c("Zipf.SUBTLEX_UK", "PREV.Brysbaert", "CNC.Brysbaert", "AoA.Kuperman", "VAL.Warriner", "AROU.Warriner", "DOM.Warriner", "ON.OLD20", "BG.SUBTLEX_UK", "Length", "RT.BLP", "Accuracy.BLP")
  349. loc_stim_pool <- lexops %>%
  350. filter(
  351. PK.Brysbaert >= 0.9,
  352. !is.na(Zipf.SUBTLEX_UK),
  353. !string %in% c(prev_misspellings, nonwords),
  354. ) %>%
  355. mutate(PoS.SUBTLEX_UK = as.character(PoS.SUBTLEX_UK)) %>%
  356. select(string, all_of(match_vars), PoS.SUBTLEX_UK)
  357. pos_cats <- unique(loc_stim_pool$PoS.SUBTLEX_UK)
  358. pos_match_vars <- sapply(pos_cats, function(x) sprintf("pos_%s", x))
  359. pos_dum_vals <- map_dfc(pos_cats, function(x) {
  360. as.numeric(loc_stim_pool$PoS.SUBTLEX_UK == x)
  361. }) %>%
  362. set_names(pos_match_vars)
  363. loc_stim_pool <- bind_cols(loc_stim_pool, pos_dum_vals)
  364. cl <- makeCluster(n_cores)
  365. cl_packages <- clusterEvalQ(cl, {library(dplyr); library(purrr); library(overlapping)})
  366. clusterExport(cl, c("loc_stim_pool", "match_vars", "pos_match_vars", "stim_tidy_long", "practice_stim_long", "loc_seeds"))
  367. loc_seeds_res <- parLapply(cl, 1:max_seed, function(seed_i) {
  368. set.seed(loc_seeds[seed_i])
  369. loc_stim_i <- loc_stim_pool %>%
  370. filter(
  371. !string %in% stim_tidy_long$string,
  372. !string %in% practice_stim_long$string
  373. ) %>%
  374. slice_sample(n = 100)
  375. ov_numeric <- map_dbl(match_vars, function(var_j) {
  376. pool_vals <- loc_stim_pool[[var_j]][!is.na(loc_stim_pool[[var_j]])]
  377. sample_vals <- loc_stim_i[[var_j]][!is.na(loc_stim_i[[var_j]])]
  378. ov <- overlapping::overlap(list(pool_vals, sample_vals))
  379. ov$OV %>%
  380. unname() %>%
  381. as.numeric()
  382. })
  383. ov_pos <- map_dbl(pos_match_vars, function(var_j) {
  384. pool_vals <- loc_stim_pool[[var_j]][!is.na(loc_stim_pool[[var_j]])]
  385. sample_vals <- loc_stim_i[[var_j]][!is.na(loc_stim_i[[var_j]])]
  386. ov <- overlapping::overlap(list(pool_vals, sample_vals))
  387. ov$OV %>%
  388. unname() %>%
  389. as.numeric()
  390. })
  391. bind_rows(
  392. tibble(
  393. seed = loc_seeds[seed_i],
  394. ov = ov_numeric,
  395. vars = match_vars,
  396. weight = 1
  397. ),
  398. tibble(
  399. seed = loc_seeds[seed_i],
  400. ov = ov_pos,
  401. vars = pos_match_vars,
  402. weight = 1/8
  403. )
  404. ) %>%
  405. mutate(
  406. weight = case_when(
  407. vars == "Length" ~ 3,
  408. vars == "Zipf.SUBTLEX_UK" ~ 2,
  409. TRUE ~ weight
  410. ),
  411. ov_w = ov * weight
  412. ) %>%
  413. group_by(seed) %>%
  414. summarise(ov_sum = sum(ov_w))
  415. }) %>%
  416. reduce(bind_rows)
  417. stopCluster(cl)
  418. best_seed <- loc_seeds_res %>%
  419. arrange(desc(ov_sum)) %>%
  420. top_n(1) %>%
  421. pull(seed)
  422. # recreate the best stimulus set
  423. set.seed(best_seed)
  424. loc_stim <- loc_stim_pool %>%
  425. filter(
  426. !string %in% stim_tidy_long$string,
  427. !string %in% practice_stim_long$string
  428. ) %>%
  429. slice_sample(n = 100)
  430. loc_stim_tidy <- loc_stim %>%
  431. rename(word=string) %>%
  432. mutate(
  433. item_nr = row_number()
  434. ) %>%
  435. select(item_nr, everything())
  436. write_csv(loc_stim, "localiser_stim.csv")
  437. # plot the distributions for localiser stim -------------------------------
  438. repres_cols <- c("#696867", "#5ba7d6")
  439. dens_pl <- bind_rows(
  440. mutate(loc_stim, type="Sample"),
  441. mutate(loc_stim_pool, type="Population")
  442. ) %>%
  443. pivot_longer(cols=c(match_vars[match_vars!="Length"]), names_to="var", values_to="value") %>%
  444. mutate(var_tidy = recode(
  445. var,
  446. Zipf.SUBTLEX_UK = "Zipf Frequency",
  447. BG.SUBTLEX_UK = "Bigram Probability",
  448. CNC.Brysbaert = "Concreteness",
  449. ON.OLD20 = "OLD20",
  450. VAL.Warriner = "Valence",
  451. AROU.Warriner = "Arousal",
  452. DOM.Warriner = "Dominance",
  453. AoA.Kuperman = "Age of Acquisition",
  454. PREV.Brysbaert = "Prevalence",
  455. RT.BLP = "Lexical Decision RT",
  456. Accuracy.BLP = "Lexical Decision Accuracy"
  457. )) %>%
  458. mutate(var_tidy = factor(var_tidy, levels=sort(unique(var_tidy)))) %>%
  459. ggplot(aes(value, fill=type, alpha=type)) +
  460. geom_density() +
  461. facet_wrap(vars(var_tidy), scales="free", nrow=4) +
  462. labs(
  463. x = "Value",
  464. y = "Density",
  465. tag = "a",
  466. fill = NULL
  467. ) +
  468. scale_fill_manual(values=repres_cols) +
  469. scale_alpha_manual(values=c(1, 0.5), guide=FALSE) +
  470. scale_x_continuous(guide = guide_axis(check.overlap = TRUE)) +
  471. theme_classic() +
  472. theme(
  473. legend.position = c(0.85, 0.1),
  474. axis.text.y=element_blank(),
  475. axis.ticks.y=element_blank(),
  476. strip.background = element_blank(),
  477. legend.margin = margin(),
  478. plot.margin = margin(3,3,3,3, unit="pt")
  479. )
  480. dat_length <- bind_rows(
  481. mutate(loc_stim, type="Sample"),
  482. mutate(loc_stim_pool, type="Population")
  483. ) %>%
  484. mutate(var="Length", value=as.numeric(Length))
  485. dat_pos <- bind_rows(
  486. loc_stim %>% mutate(type="Sample", max_n=nrow(.)),
  487. loc_stim_pool %>% mutate(type="Population", max_n=nrow(.))
  488. ) %>%
  489. mutate(var="Part of Speech", value=PoS.SUBTLEX_UK) %>%
  490. group_by(type, value, var) %>%
  491. summarise(prop = n()/max_n) %>%
  492. distinct() %>%
  493. filter(value %in% loc_stim$PoS.SUBTLEX_UK)
  494. length_pl <- dat_length %>%
  495. ggplot(aes(value)) +
  496. geom_histogram(aes(y = after_stat(count / sum(count))), position = "identity", binwidth=1, fill=repres_cols[[1]], alpha=1, data=filter(dat_length, type=="Population")) +
  497. geom_histogram(aes(y = after_stat(count / sum(count))), position = "identity", binwidth=1, fill=repres_cols[[2]], alpha=0.5, data=filter(dat_length, type=="Sample")) +
  498. facet_wrap(vars(var), scales="free") +
  499. labs(
  500. x = "",
  501. y = "Proportion",
  502. tag = "b"
  503. ) +
  504. theme_classic() +
  505. theme(
  506. legend.position = "none",
  507. legend.margin = margin(),
  508. plot.margin = margin(3,3,3,3, unit="pt"),
  509. strip.background = element_blank()
  510. )
  511. pos_pl <- dat_pos %>%
  512. ggplot(aes(value, prop)) +
  513. geom_col(position = "identity", fill=repres_cols[[1]], alpha=1, data=filter(dat_pos, type=="Population")) +
  514. geom_col(position = "identity", fill=repres_cols[[2]], alpha=0.5, data=filter(dat_pos, type=="Sample")) +
  515. facet_wrap(vars(var), scales="free") +
  516. labs(
  517. x = "",
  518. y = NULL
  519. ) +
  520. theme_classic() +
  521. theme(
  522. legend.position = "none",
  523. legend.margin = margin(),
  524. plot.margin = margin(3,3,3,3, unit="pt"),
  525. strip.background = element_blank()
  526. )
  527. # loc_stim_summ <- (dens_pl / (length_pl | pos_pl + xlab("Value") + theme(axis.title.x = element_text(hjust=-0.175)))) +
  528. # plot_layout(heights = c(5.8, 1.2), widths = c(6.5, 2.5, 4))
  529. loc_stim_summ <- dens_pl + length_pl + (pos_pl + xlab("Value") + theme(axis.title.x = element_text(hjust=0.18))) +
  530. plot_layout(
  531. heights = c(6, 1, 1),
  532. design = "
  533. AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
  534. BBBBBBBBBBCCCCCCCCCCCCCCCCCCCC"
  535. )
  536. # loc_stim_summ
  537. ggsave(file.path("fig", "localiser_stimuli_summary.png"), loc_stim_summ, height=5.5, width=6.5, device="png", type="cairo")
  538. ggsave(file.path("fig", "localiser_stimuli_summary.pdf"), loc_stim_summ, height=5.5, width=6.5, device="pdf")
  539. message("Stimuli done!")