JackEdTaylor
/
Picture-Word-N1


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632
							library(readr)
library(dplyr)
library(tidyr)
library(purrr)
library(forcats)
library(ggplot2)
library(stringi)
library(LexOPS)
library(ggExtra)
library(gghalves)
library(patchwork)
library(overlapping)
library(parallel)
library(stringdist)
n_cores <- 12

# import stimuli data -----------------------------------------------------

boss <- read_csv("boss.csv") %>%
  # lowercase words
  mutate(modal_name = stri_trans_tolower(modal_name)) %>%
  # percentages as numbers
  mutate_at(vars(dplyr::matches("perc")), function(x) as.numeric(sub("%", "", x))) %>%
  # no spaces
  filter(!grepl(" ", modal_name)) %>%
  # join to LexOPS
  full_join(lexops, by = c("modal_name" = "string")) %>%
  # make note of whether a picture's modal name
  mutate(is_pic = ifelse(is.na(filename), "no", "yes"))

swow <- read_csv("swow_ppmi.csv")


# useful functions for selecting stimuli ----------------------------------

# function to calculate cosine similarity
calc_cos <- function(a, b) {
  if (length(a) != length(b)) stop("Inconsistent vector lengths")
  sum(a * b) / sqrt(sum(a^2)*sum(b^2))
}

# function to calculate associative strength for SWOW
assoc_str <- function(cues_a, cues_b, .swow = swow, print = FALSE, sim_measure = "R123.Strength") {
  
  if (length(cues_a) != length(cues_b)) stop("Inconsistent vector lengths")
  
  print_perc_vals <- seq(0, 100, by = 0.1)
  
  print_iters <- if (print) {
    lapply(1:length(cues_a), function(i) {
      perc_done <- (i / length(cues_a)) * 100
      if (round(perc_done, 1) %in% print_perc_vals) {
        print_perc_vals <<- print_perc_vals[print_perc_vals != round(perc_done, 1)]
        i
      } else {
        NULL
      }
    })
  } else {
    c()
  }
  
  sapply(1:length(cues_a), function(i) {
    
    cues_a_i <- as.character(cues_a[[i]])
    cues_b_i <- as.character(cues_b[[i]])
    
    if (identical(cues_a_i, cues_b_i)) {
      cos_i <- if (cues_a_i %in% .swow$cue) 1 else NA
    } else if (!all(c(cues_a_i, cues_b_i) %in% .swow$cue)) {
      cos_i <- NA
    } else {
      cues_neighbours <- .swow %>%
        dplyr::filter(cue %in% c(cues_a_i, cues_b_i)) %>%
        dplyr::mutate(cue = dplyr::case_when(
          cue == cues_a_i ~ "cue_a",
          cue == cues_b_i ~ "cue_b"
        )) %>%
        select(cue, response, !!dplyr::sym(sim_measure)) %>%
        tidyr::pivot_wider(names_from = cue, values_from = !!dplyr::sym(sim_measure), names_prefix = "p_") %>%
        #dplyr::filter(!is.na(p_cue_a), !is.na(p_cue_b))
        tidyr::replace_na(list(p_cue_a = 0, p_cue_b = 0))
      
      cos_i <- calc_cos(cues_neighbours$p_cue_a, cues_neighbours$p_cue_b)
    }
    
    if (print & i %in% print_iters) {
      perc_done <- i / length(cues_a) * 100
      cat(sprintf("%i/%i (%.1f%%), \"%s\" ~ \"%s\" = %s\n", i, length(cues_a), round(perc_done, 1), cues_a_i, cues_b_i, cos_i))
    }
    
    cos_i
  })
}

# assoc_str(c("cat", "cat", "cat"), c("cat", "teacher", "jungle"))
# assoc_str(c("cat", "cat", "cat"), c("cat", "teacher", "jungle"), sim_measure = "ppmi")

# function to get semantic similarity for swow (direct neighbours)
cos_sim <- function(matches, target) {
  if (all(matches==target)) return(rep(1, length(matches)))
  assoc_str(rep(target, length(matches)), matches, sim_measure = "ppmi")
}

# cos_sim(c("cat", "teacher", "jungle", "dog"), "cat")
# x <- cos_sim(sample(lexops$string, 5000), "cat")

# function for maximising levenshtein distance, assuming that length is matched exactly
# (returns value of 1 if maximum distance possible, otherwise 0)
maximise_lev_dist <- function(xsource, targets) {
  dists <- stringdist(a=xsource, b=targets, method="lv")
  as.numeric(dists==nchar(xsource))
}


# LexOPS pipeline ---------------------------------------------------------

stim_seed <- 1

# words which are much more likely to be produced by Americans/Canadians
americanisms <- c("squash", "airplane", "candy", "bison", "buffalo", "sidewalk", "camper", "motel", "checkers", "store", "soccer", "trash", "burner", "boardwalk", "automobile", "cellphone", "tombstone", "mailbox", "panties", "cab", "yogurt", "pants")
# words which are inappropriate and distracting
inappropriate <- c("piss", "penis", "sperm", "cannabis", "poop", "breast", "marijuana", "blowjob", "boob", "porn", "pussy", "bosom", "vagina", "muff", "breasts", "ass", "pee", "rump", "vulva")
# words which are unimageable, or describe things whose images would be very different to those in the normed set (e.g. 'library', when there are no buildings in the set) or are not actually nouns
unimageable <- c("siding", "cemetery", "polo", "woods", "bowel", "lobby", "bladder", "library", "zoo", "volcano", "apartments", "pasture", "peroxide", "temple", "weep", "sunrise", "storm", "mosque", "indigo", "bookshop", "pharmacy", "bakery", "orphanage", "sunset", "meadow", "venom", "brewery", "campus", "bulletin", "canal", "ranch", "salon", "yoga", "canteen", "livestock", "mammal", "casino", "subway", "skies", "vaccine", "placenta", "landfill", "jump", "snapshot", "vault", "badminton", "vessel", "troll", "surface", "parasite", "asylum", "sowing", "barefoot", "swimming", "arcade", "town", "shaft", "shore", "birth", "catcher", "waltz", "stomp", "spa", "animal", "cattle", "inch", "jungle", "attire", "postage", "poultry", "station", "pregnancy", "beach", "waist", "midget", "burp", "skyline", "workplace", "judo", "primate", "yawn", "trench", "bum", "injury", "shipment", "karaoke", "workshop", "chapel", "orchard", "runway", "saloon", "womb", "laughter", "cinema", "insulin", "chlorine", "stream", "movie", "chord", "piles", "rainy", "bacteria", "ravine", "beast", "motorway", "monsoon", "terrace", "woodwind", "jail", "retina", "planet", "wetland", "morphine", "cafe", "siren", "plantation", "sunlight", "meter", "print", "mucus", "sleet", "visa", "narcotics", "steroids", "urine", "blink", "appliance", "turquoise", "voice", "racing", "uterus", "chime", "slap", "nod", "pet", "bottom", "nudge", "army", "apartment", "village", "nightclub", "morgue", "kick", "city", "cellar", "outdoors", "squint", "booty", "lab", "street", "streets", "shock", "massacre", "item", "yard", "carnival", "structure", "prairie", "nudity", "shred", "nursery", "belch", "tint", "kilometer", "kilogram", "beep", "opera", "pageant", "swatter", "booze", "wiring", "vomit", "rodeo", "surf", "buffet", "bite", "chant", "swine", "mansion", "mill", "shack", "cabin", "alley", "lump", "ballot", "supper", "material", "lobe", "steel", "venison", "beef", "butt", "liquor", "pop", "office", "museum", "lookout", "port", "vitamin", "tee", "church", "labyrinth", "utensil")
# words which describe people (no images of people in the normed set of images, only parts like 'hand')
people <- c("tailor", "landlord", "waitress", "lifeguard", "parent", "swimmer", "cashier", "actor", "vendor", "bride", "broker", "salesman", "sailor", "pilot", "pirate", "redhead", "tutor", "runner", "brunette", "adult", "wizard", "chaps", "lawyer", "beekeeper", "priest", "kids", "uncle", "jockey", "surfer", "tenant", "violinist", "jury", "babysitter", "milkman", "astronaut", "beggar", "infant", "child", "robber", "baby", "talker", "tradesman", "lad", "magician", "plumber", "captain", "shooter", "nun", "prostitute", "pastor", "chauffeur", "rapper", "assassin", "bachelor", "stewardess", "visitor", "chairman", "chairwoman", "buyer", "sons", "banker", "veteran", "niece", "twins", "lady", "skater", "wrestler", "actress", "birth", "slave", "bomber", "boxer", "musician", "inmate", "worker", "kid", "carpenter", "physician", "blacksmith", "poet", "tribe", "shopper", "pilgrim", "ambassador", "chemist", "citizen", "male", "analyst", "chorus", "caveman", "butler", "brigade", "publisher", "roommate", "jogging", "mistress", "writer", "accountant", "mum", "attendant", "seaman", "monarch", "audience", "umpire", "brother", "typist", "scout", "assistant", "witch", "singer", "superhero", "troops", "pupil", "waiter", "troop", "pianist", "bodyguard", "biker", "choir", "surgeon", "men", "maid", "quarterback", "model", "officer", "mermaid", "teacher", "stepmother", "lecturer", "thief", "student", "reporter", "librarian", "nurse", "stylist", "realtor", "boy", "operator", "therapist", "platoon", "mother", "ladies", "men", "wife", "husband", "wives", "husbands", "nomad", "proprietor", "steward")
# shortened words which may otherwise appear twice
shortened <- c("rhino", "limo", "chimp", "scuba", "bike")
# alternate versions of the same word, e.g. tomb and tombstone (plurals are okay, but this prevents repeating a word, e.g. 'spice' and 'spices')
alternates <- c("spices", "tomb", "nostril", "needles", "chips", "levers", "motorbike", "peanuts", "trucks", "stairway", "meteor", "stair", "snails", "boots", "kitty", "piggy", "liqueur", "weed", "tummy", "ropes", "bumblebee")
# words which I think have high concreteness because they are common misspellings of concrete words
misspellings <- c("canon")
# modal names for images clearly incorrectly named
incorrects <- c("nut", "trumpet", "tuba", "spinach")
# plural words (as all images are single objects)
plurals <- c("sticks", "bees", "buttons", "strings", "ceramics", "tables", "arms", "molasses", "cereals", "mice", "brushes")
# incongruent matches which have been excluded as they were possible descriptions for their image
plausible <- c(
  "rubber",  # was a match for "statue", which could also be a small rubber
  "buck",  # was a match for a picture of a book (homophone)
  "marrow",  # was a match for pickle
  "logo",  # just about anything could be a logo
  "kit"  # was a match for a baseball cap
)

stim <- boss %>%
  filter(
    (is_pic == "yes" & dupe_pref) | is_pic=="no",
    modal_name %in% swow$cue,
    PK.Brysbaert >= 0.9,
    PoS.SUBTLEX_UK == "noun",
    CNC.Brysbaert >= 4,
    !modal_name %in% c(americanisms, inappropriate, unimageable, people, shortened, alternates, misspellings, incorrects, plurals, plausible)
  ) %>%
  mutate(is_pic = as.factor(is_pic)) %>%
  rename(string = "modal_name") %>%
  split_by(is_pic, "yes" ~ "no") %>%
  # -0.5:0 (similarity of between 0.5 and 1) seems to give high similarity
  # -1:-0.99 (similarity of between 0 and 0.01) gives suitably dissimilar
  control_for_map(cos_sim, string, -1:-0.99, name = "cos_ppmi_sim") %>%
  control_for_map(maximise_lev_dist, string, 1:1, name = "max_lev_dist") %>%
  control_for(Length, 0:0) %>%
  control_for(CNC.Brysbaert, -0.25:0.25) %>%
  control_for(ON.OLD20, -0.75:0.75) %>%
  control_for(Zipf.SUBTLEX_UK, -0.125:0.125) %>%
  control_for(BG.SUBTLEX_UK, -0.0025:0.0025) %>%
  generate(200, seed = stim_seed)

write_csv(stim, "stim.csv")

stim_lev_dists <- stim %>%
  rowwise() %>%
  mutate(levenshtein_distance = stringdist(A1, A2, method="lv")) %>%
  ungroup() %>%
  select(item_nr, levenshtein_distance)

stim_tidy <- stim %>%
  long_format("all") %>%
  rename(perc_name_agree = "perc_name_agree_denom_fq_inputs") %>%
  pivot_wider(id_cols = item_nr, names_from = condition, values_from = c(string, filename, perc_name_agree, Zipf.SUBTLEX_UK, Length, cos_ppmi_sim, CNC.Brysbaert, BG.SUBTLEX_UK, nb_diff_names, ON.OLD20)) %>%
  select(item_nr, string_A1, filename_A1, perc_name_agree_A1, string_A2, everything(), -filename_A2, -perc_name_agree_A2) %>%
  left_join(stim_lev_dists, by="item_nr")


# optimise split for counterbalancing -------------------------------------

message("Optimising counterbalance split")

vars_to_match <- c("perc_name_agree_A1", "Zipf.SUBTLEX_UK_A1", "Zipf.SUBTLEX_UK_A2", "cos_ppmi_sim_A1", "cos_ppmi_sim_A2", "BG.SUBTLEX_UK_A1", "BG.SUBTLEX_UK_A2", "CNC.Brysbaert_A1", "CNC.Brysbaert_A2", "ON.OLD20_A1", "ON.OLD20_A2", "Length_A1", "Length_A2")

# how many seeds to try
max_seed <- 50000

cl <- makeCluster(n_cores)
cl_packages <- clusterEvalQ(cl, {library(dplyr); library(purrr); library(overlapping)})
clusterExport(cl, c("stim_tidy", "vars_to_match"))

dist_overlaps <- parLapply(cl, 1:max_seed, function(seed_i) {
  cat(sprintf("%s   \r", seed_i))
  set.seed(seed_i)
  order_grp_vec <- sample(rep(c(1, 2), 100), 200, replace = FALSE)
  stim_seed_i <- mutate(stim_tidy, order_grp = order_grp_vec)
  tibble(
    seed_i = seed_i,
    var_j = vars_to_match,
    ov = map_dbl(vars_to_match, function(var_j) {
      var_grp_1 <- stim_seed_i %>%
        filter(order_grp == 1) %>%
        pull(var_j) %>%
        as.numeric()
      var_grp_2 <- stim_seed_i %>%
        filter(order_grp == 2) %>%
        pull(var_j) %>%
        as.numeric()
      ov <- overlapping::overlap(list(var_grp_1, var_grp_2))
      ov$OV %>%
        unname() %>%
        as.numeric()
    })
  )
}) %>%
  reduce(bind_rows)

stopCluster(cl)

dist_overlaps_summ <- dist_overlaps %>%
  group_by(seed_i) %>%
  summarise(sum = sum(ov), median = median(ov), mean = mean(ov), sd = sd(ov), min = min(ov)) %>%
  arrange(desc(mean), sd)

# apply the split ---------------------------------------------------------

dist_overlaps_summ %>%
  pull(seed_i) %>%
  first() %>%
  set.seed()

order_grp_vec <- sample(rep(c(1, 2), 100), 200, replace = FALSE)
stim_tidy <- mutate(stim_tidy, order_grp = order_grp_vec)

write_csv(stim_tidy, "stim_tidy.csv")

stim_tidy_long <- stim %>%
  long_format("all") %>%
  arrange(item_nr) %>%
  mutate(
    filename = ifelse(is.na(filename), lag(filename), filename),
    cos_ppmi_sim_tidy = as.numeric(ifelse(cos_ppmi_sim=="1", NA, cos_ppmi_sim))
  ) %>%
  rename(perc_name_agree = "perc_name_agree_denom_fq_inputs") %>%
  # also store the order group
  full_join(select(stim_tidy, item_nr, order_grp), by = "item_nr")

write_csv(stim_tidy_long, "stim_tidy_long.csv")


# select practice trials --------------------------------------------------

message("Selecting practice trials")

prac_seed <- 111

set.seed(prac_seed)
A1_trials <- sample(1:20, 10)

practice_stim <- boss %>%
  filter(
    (is_pic == "yes" & dupe_pref) | is_pic=="no",
    modal_name %in% swow$cue,
    PK.Brysbaert >= 0.9,
    PoS.SUBTLEX_UK == "noun",
    CNC.Brysbaert >= 4,
    !modal_name %in% c(americanisms, inappropriate, unimageable, people, shortened, alternates, misspellings, incorrects, plurals, plausible),
    !modal_name %in% stim_tidy_long$string
  ) %>%
  mutate(is_pic = as.factor(is_pic)) %>%
  rename(string = "modal_name") %>%
  split_by(is_pic, "yes" ~ "no") %>%
  # -0.5:0 (similarity of between 0.5 and 1) seems to give high similarity
  # -1:-0.999 (similarity of between 0 and 0.001) gives suitably disimilar
  control_for_map(cos_sim, string, -1:-0.99, name = "cos_ppmi_sim") %>%
  control_for_map(maximise_lev_dist, string, 1:1, name = "max_lev_dist") %>%
  control_for(Length, 0:0) %>%
  generate(20, seed = prac_seed)

practice_stim_long <- practice_stim %>%
  long_format("all") %>%
  mutate(filename = ifelse(is.na(filename), lag(filename), filename)) %>%
  # randomly allocate half the items to congruent condition, and half to incongruent, and only keep the relevant trials
  filter( (condition=="A1" & item_nr %in% A1_trials) | (condition=="A2" & !item_nr %in% A1_trials) )

# save the practice trials
write_csv(practice_stim_long, "practice_stim.csv")


# tidy picture-word stimuli figures ---------------------------------------

itemwise_vars_for_dens <- c("Length", "Zipf.SUBTLEX_UK", "BG.SUBTLEX_UK", "CNC.Brysbaert", "ON.OLD20")
distwise_vars_for_dens <- c("perc_name_agree", "cos_ppmi_sim_tidy")
# plt_cols <- c("#007EFF", "#FF007E")
plt_cols <- c("#E69F00", "#56B4E9")
counterbalanced_grps_label <- c("Set 1", "Set 2")
fontsize <- 12

# Format data
plt_stim_itemwise <- stim_tidy_long %>%
  mutate(condition = sub("-", "\n", condition, fixed=TRUE)) %>%
  pivot_longer(cols=all_of(itemwise_vars_for_dens), names_to="plt_variable", values_to="plt_value") %>%
  mutate(condition_j = as.numeric(factor(condition))) %>%
  rowwise() %>%
  # add horizontal jitter to points and lines
  mutate(., condition_j = ifelse(condition=="A1", condition_j + runif(1, 0.1, 0.15), condition_j + runif(1, -0.15, -0.1))) %>%
  # mutate(., condition_j = ifelse(condition=="A1", condition_j + 0.15, condition_j -0.15)) %>%
  ungroup() %>%
  mutate(
    condition = ifelse(condition=="A1", "Picture\nCongruent", "Picture\nIncongruent"),
    plt_variable = factor(plt_variable, levels = itemwise_vars_for_dens),
    plt_variable = fct_recode(
      plt_variable,
      `Length` = "Length",
      `Zipf Frequency` = "Zipf.SUBTLEX_UK",
      `Bigram Probability` = "BG.SUBTLEX_UK",
      `Concreteness` = "CNC.Brysbaert",
      `OLD20` = "ON.OLD20"
    )
  )

itemwise_plt <- plt_stim_itemwise %>%
  ggplot(aes(condition, plt_value, colour=factor(order_grp), fill=factor(order_grp))) +
  geom_half_violin(data=filter(plt_stim_itemwise, condition=="Picture\nCongruent"), alpha=0.5, trim=FALSE, side="l", position=position_identity(), show.legend=FALSE) +
  geom_half_violin(data=filter(plt_stim_itemwise, condition=="Picture\nIncongruent"), alpha=0.5, trim=FALSE, side="r", position=position_identity(), show.legend=FALSE) +
  geom_point(aes(x=condition_j), alpha=0.75, shape=16, size=0.8) +
  geom_line(aes(x=condition_j, group=item_nr), alpha=0.3) +
  scale_fill_manual(name = "Stimulus Set", labels = counterbalanced_grps_label, values = plt_cols) +
  scale_colour_manual(name = "Stimulus Set", labels = counterbalanced_grps_label, values = plt_cols, guide = guide_legend(override.aes = list(alpha = 1, size=2, linewidth=1))) +
  scale_y_continuous(breaks = scales::pretty_breaks(5)) +
  facet_wrap(vars(plt_variable), scales="free_y") +
  labs(
    x = "\nCongruency Condition\n",
    y = "Value",
    tag = "a"
  ) +
  theme_classic() +
  theme(
    legend.position = c(0.85, 0.15),
    text = element_text(size=fontsize),
    # plot.title = element_text(hjust=-0.1),
    strip.background = element_blank(),
    legend.margin = margin(),
    plot.margin = margin(3,3,3,8, unit="pt")
  )

plt_stim_distwise <- stim_tidy_long %>%
  mutate(condition = sub("-", "\n", condition, fixed=TRUE)) %>%
  pivot_longer(cols=all_of(distwise_vars_for_dens), names_to="plt_variable", values_to="plt_value") %>%
  mutate(
    plt_variable = factor(plt_variable, levels = distwise_vars_for_dens),
    plt_variable = fct_recode(
      plt_variable,
      `Percentage of Name Agreement` = "perc_name_agree",
      `Cosine PPMI Semantic Similarity` = "cos_ppmi_sim_tidy"
    ),
    plt_variable = fct_rev(plt_variable)
  )

custom_breaks <- function(x) { if (max(x) < 0.01) pretty(x, 3) else pretty(x, 5) }

distwise_plt <- plt_stim_distwise %>%
  ggplot(aes(plt_value, colour=factor(order_grp), fill=factor(order_grp))) +
  geom_density(alpha=0.5, show.legend = FALSE) +
  facet_wrap(vars(plt_variable), scales="free") +
  labs(
    x = "Value",
    y = "Density",
    tag = "b"
  ) +
  scale_fill_manual(name = "Stimulus Set", labels = counterbalanced_grps_label, values = plt_cols) +
  scale_colour_manual(name = "Stimulus Set", labels = counterbalanced_grps_label, values = plt_cols) +
  scale_x_continuous(breaks = custom_breaks) +
  scale_y_continuous(breaks = scales::pretty_breaks(3)) +
  theme_classic() +
  theme(
    legend.position = "none",
    text = element_text(size=fontsize),
    # plot.title = element_text(hjust=-0.1),
    strip.background = element_blank(),
    legend.margin = margin(),
    plot.margin = margin(3,3,3,8, unit="pt")
  )

stim_summ <- (itemwise_plt / distwise_plt) +
  plot_layout(
    heights = c(5.5, 1), widths = 6.5,
    guides = "keep"
  )

ggsave(file.path("fig", "stimuli_summary.png"), stim_summ, height=5.5, width=6.5, device="png", type="cairo")
ggsave(file.path("fig", "stimuli_summary.pdf"), stim_summ, height=5.5, width=6.5, device="pdf")


# select localiser stimuli ------------------------------------------------

message("Selecting localiser stimuli")

# misspelt words in the prevalence norms
prev_misspellings <- c("yogurt")
# words which might not be technically words
nonwords <- c("yippee")

max_seed <- 500000

set.seed(stim_seed)
loc_seeds <- sample(1:2147483647, max_seed, replace=FALSE)

match_vars <- c("Zipf.SUBTLEX_UK", "PREV.Brysbaert", "CNC.Brysbaert", "AoA.Kuperman", "VAL.Warriner", "AROU.Warriner", "DOM.Warriner", "ON.OLD20", "BG.SUBTLEX_UK", "Length", "RT.BLP", "Accuracy.BLP")

loc_stim_pool <- lexops %>%
  filter(
    PK.Brysbaert >= 0.9,
    !is.na(Zipf.SUBTLEX_UK),
    !string %in% c(prev_misspellings, nonwords),
  ) %>%
  mutate(PoS.SUBTLEX_UK = as.character(PoS.SUBTLEX_UK)) %>%
  select(string, all_of(match_vars), PoS.SUBTLEX_UK)

pos_cats <- unique(loc_stim_pool$PoS.SUBTLEX_UK)
pos_match_vars <- sapply(pos_cats, function(x) sprintf("pos_%s", x))

pos_dum_vals <- map_dfc(pos_cats, function(x) {
  as.numeric(loc_stim_pool$PoS.SUBTLEX_UK == x)
}) %>%
  set_names(pos_match_vars)

loc_stim_pool <- bind_cols(loc_stim_pool, pos_dum_vals)

cl <- makeCluster(n_cores)
cl_packages <- clusterEvalQ(cl, {library(dplyr); library(purrr); library(overlapping)})
clusterExport(cl, c("loc_stim_pool", "match_vars", "pos_match_vars", "stim_tidy_long", "practice_stim_long", "loc_seeds"))

loc_seeds_res <- parLapply(cl, 1:max_seed, function(seed_i) {
  set.seed(loc_seeds[seed_i])
  loc_stim_i <- loc_stim_pool %>%
    filter(
      !string %in% stim_tidy_long$string,
      !string %in% practice_stim_long$string
    ) %>%
    slice_sample(n = 100)
  ov_numeric <- map_dbl(match_vars, function(var_j) {
    pool_vals <- loc_stim_pool[[var_j]][!is.na(loc_stim_pool[[var_j]])]
    sample_vals <- loc_stim_i[[var_j]][!is.na(loc_stim_i[[var_j]])]
    ov <- overlapping::overlap(list(pool_vals, sample_vals))
    ov$OV %>%
      unname() %>%
      as.numeric()
  })
  ov_pos <- map_dbl(pos_match_vars, function(var_j) {
    pool_vals <- loc_stim_pool[[var_j]][!is.na(loc_stim_pool[[var_j]])]
    sample_vals <- loc_stim_i[[var_j]][!is.na(loc_stim_i[[var_j]])]
    ov <- overlapping::overlap(list(pool_vals, sample_vals))
    ov$OV %>%
      unname() %>%
      as.numeric()
  })
  bind_rows(
    tibble(
      seed = loc_seeds[seed_i],
      ov = ov_numeric,
      vars = match_vars,
      weight = 1
    ),
    tibble(
      seed = loc_seeds[seed_i],
      ov = ov_pos,
      vars = pos_match_vars,
      weight = 1/8
    )
  ) %>%
    mutate(
      weight = case_when(
        vars == "Length" ~ 3,
        vars == "Zipf.SUBTLEX_UK" ~ 2,
        TRUE ~ weight
      ),
      ov_w = ov * weight
    ) %>%
    group_by(seed) %>%
    summarise(ov_sum = sum(ov_w))
}) %>%
  reduce(bind_rows)

stopCluster(cl)

best_seed <- loc_seeds_res %>%
  arrange(desc(ov_sum)) %>%
  top_n(1) %>%
  pull(seed)

# recreate the best stimulus set
set.seed(best_seed)
loc_stim <- loc_stim_pool %>%
  filter(
    !string %in% stim_tidy_long$string,
    !string %in% practice_stim_long$string
  ) %>%
  slice_sample(n = 100)

loc_stim_tidy <- loc_stim %>%
  rename(word=string) %>%
  mutate(
    item_nr = row_number()
  ) %>%
  select(item_nr, everything())

write_csv(loc_stim, "localiser_stim.csv")


# plot the distributions for localiser stim -------------------------------

repres_cols <- c("#696867", "#5ba7d6")

dens_pl <- bind_rows(
  mutate(loc_stim, type="Sample"),
  mutate(loc_stim_pool, type="Population")
) %>%
  pivot_longer(cols=c(match_vars[match_vars!="Length"]), names_to="var", values_to="value") %>%
  mutate(var_tidy = recode(
    var,
    Zipf.SUBTLEX_UK = "Zipf Frequency",
    BG.SUBTLEX_UK = "Bigram Probability",
    CNC.Brysbaert = "Concreteness",
    ON.OLD20 = "OLD20",
    VAL.Warriner = "Valence",
    AROU.Warriner = "Arousal",
    DOM.Warriner = "Dominance",
    AoA.Kuperman = "Age of Acquisition",
    PREV.Brysbaert = "Prevalence",
    RT.BLP = "Lexical Decision RT",
    Accuracy.BLP = "Lexical Decision Accuracy"
  )) %>%
  mutate(var_tidy = factor(var_tidy, levels=sort(unique(var_tidy)))) %>%
  ggplot(aes(value, fill=type, alpha=type)) +
  geom_density() +
  facet_wrap(vars(var_tidy), scales="free", nrow=4) +
  labs(
    x = "Value",
    y = "Density",
    tag = "a",
    fill = NULL
  ) +
  scale_fill_manual(values=repres_cols) +
  scale_alpha_manual(values=c(1, 0.5), guide=FALSE) +
  scale_x_continuous(guide = guide_axis(check.overlap = TRUE)) +
  theme_classic() +
  theme(
    legend.position = c(0.85, 0.1),
    axis.text.y=element_blank(),
    axis.ticks.y=element_blank(),
    strip.background = element_blank(),
    legend.margin = margin(),
    plot.margin = margin(3,3,3,3, unit="pt")
  )

dat_length <- bind_rows(
  mutate(loc_stim, type="Sample"),
  mutate(loc_stim_pool, type="Population")
) %>%
  mutate(var="Length", value=as.numeric(Length))

dat_pos <- bind_rows(
  loc_stim %>% mutate(type="Sample", max_n=nrow(.)),
  loc_stim_pool %>% mutate(type="Population", max_n=nrow(.))
) %>%
  mutate(var="Part of Speech", value=PoS.SUBTLEX_UK) %>%
  group_by(type, value, var) %>%
  summarise(prop = n()/max_n) %>%
  distinct() %>%
  filter(value %in% loc_stim$PoS.SUBTLEX_UK)

length_pl <- dat_length %>%
  ggplot(aes(value)) +
  geom_histogram(aes(y = after_stat(count / sum(count))), position = "identity", binwidth=1, fill=repres_cols[[1]], alpha=1, data=filter(dat_length, type=="Population")) +
  geom_histogram(aes(y = after_stat(count / sum(count))), position = "identity", binwidth=1, fill=repres_cols[[2]], alpha=0.5, data=filter(dat_length, type=="Sample")) +
  facet_wrap(vars(var), scales="free") +
  labs(
    x = "",
    y = "Proportion",
    tag = "b"
  ) +
  theme_classic() +
  theme(
    legend.position = "none",
    legend.margin = margin(),
    plot.margin = margin(3,3,3,3, unit="pt"),
    strip.background = element_blank()
  )

pos_pl <- dat_pos %>%
  ggplot(aes(value, prop)) +
  geom_col(position = "identity", fill=repres_cols[[1]], alpha=1, data=filter(dat_pos, type=="Population")) +
  geom_col(position = "identity", fill=repres_cols[[2]], alpha=0.5, data=filter(dat_pos, type=="Sample")) +
  facet_wrap(vars(var), scales="free") +
  labs(
    x = "",
    y = NULL
  ) +
  theme_classic() +
  theme(
    legend.position = "none",
    legend.margin = margin(),
    plot.margin = margin(3,3,3,3, unit="pt"),
    strip.background = element_blank()
  )

# loc_stim_summ <- (dens_pl / (length_pl | pos_pl + xlab("Value") + theme(axis.title.x = element_text(hjust=-0.175)))) +
#   plot_layout(heights = c(5.8, 1.2), widths = c(6.5, 2.5, 4))

loc_stim_summ <- dens_pl + length_pl + (pos_pl + xlab("Value") + theme(axis.title.x = element_text(hjust=0.18))) +
  plot_layout(
    heights = c(6, 1, 1),
    design = "
    AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
    BBBBBBBBBBCCCCCCCCCCCCCCCCCCCC"
  )

# loc_stim_summ

ggsave(file.path("fig", "localiser_stimuli_summary.png"), loc_stim_summ, height=5.5, width=6.5, device="png", type="cairo")
ggsave(file.path("fig", "localiser_stimuli_summary.pdf"), loc_stim_summ, height=5.5, width=6.5, device="pdf")


message("Stimuli done!")