MB-CDI

Purpose

This page documents the cleaning and merging procedures related to the MB-CDI data. The home visit workflow strips these files into their own set of CSVs under data/csv/home_visit/mbcdi.

The aggregate (across language group and age) data files are saved under data/csv/home_visit/agg.

Preparation

source(file.path(here::here(), "R", "_OLD", "functions.R"))

purrr::walk(list.files(file.path(here::here(), "R"), "\\.R$", full.names = TRUE), source)

Let’s investigate the number of files, records, and variables per file.

# Full paths of every CSV in the home-visit MB-CDI directory.
mbcdi_fns <- list.files(
  path = file.path(here::here(), "data", "csv", "home_visit", "mbcdi"),
  pattern = "\\.csv$",
  full.names = TRUE
)

# Number of files found.
length(mbcdi_fns)
## [1] 25
#' Summarise one MB-CDI CSV export.
#'
#' Reads the file with every column typed as character and reports its
#' dimensions together with the values returned by
#' `extract_age_group_from_name()` and `form_language()` for the same path
#' (both helpers are defined outside this chunk).
#'
#' @param csv Path to a single, readable CSV file.
#' @return A one-row data.frame with columns `fn`, `age_group`, `lang_cond`,
#'   `n_subs` (number of rows), and `n_vars` (number of columns).
make_datafile_summary <- function(csv) {
  # The assertthat predicates only return TRUE/FALSE; assert_that() is what
  # turns a failed check into an informative error.
  assertthat::assert_that(assertthat::is.string(csv))
  assertthat::assert_that(assertthat::is.readable(csv))

  df <- readr::read_csv(
    csv,
    col_types = readr::cols(.default = "c"),
    show_col_types = FALSE
  )

  data.frame(
    fn = csv,
    age_group = extract_age_group_from_name(csv),
    lang_cond = form_language(csv),
    n_subs = nrow(df),
    n_vars = ncol(df)
  )
}
# Summarise every export, then stack the one-row summaries into one table.
mbcdi_file_dat <- mbcdi_fns |>
  purrr::map(make_datafile_summary) |>
  purrr::list_rbind()

# Display the summary ordered by age group, language condition, and row count.
knitr::kable(
  dplyr::arrange(mbcdi_file_dat, age_group, lang_cond, n_subs),
  format = "html"
) |>
  kableExtra::kable_classic()
fn age_group lang_cond n_subs n_vars
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/334099_mbcdi_12_bilingual_english.csv 12 bilingual_english 1 730
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/411456_mbcdi_12_bilingual_english.csv 12 bilingual_english 1 717
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/740623_mbcdi_12_bilingual_english.csv 12 bilingual_english 33 712
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/411469_mbcdi_12_bilingual_spanish.csv 12 bilingual_spanish 1 717
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/740624_mbcdi_12_bilingual_spanish.csv 12 bilingual_spanish 1 712
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/331848_mbcdi_12_english.csv 12 english 4 256
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/363431_mbcdi_12_english.csv 12 english 10 255
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/740625_mbcdi_12_english.csv 12 english 172 259
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/363466_mbcdi_18_bilingual_english.csv 18 bilingual_english 0 815
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/740626_mbcdi_18_bilingual_english.csv 18 bilingual_english 47 829
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/411388_mbcdi_18_bilingual_spanish.csv 18 bilingual_spanish 0 815
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/740627_mbcdi_18_bilingual_spanish.csv 18 bilingual_spanish 5 829
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/1136694_mbcdi_18_english.csv 18 english 0 359
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/1151489_mbcdi_18_english.csv 18 english 0 359
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/307736_mbcdi_18_english.csv 18 english 4 352
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/363349_mbcdi_18_english.csv 18 english 9 352
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/740628_mbcdi_18_english.csv 18 english 157 358
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/363465_mbcdi_24_bilingual_english.csv 24 bilingual_english 0 815
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/740631_mbcdi_24_bilingual_english.csv 24 bilingual_english 36 829
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/408149_mbcdi_24_bilingual_spanish.csv 24 bilingual_spanish 1 815
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/740630_mbcdi_24_bilingual_spanish.csv 24 bilingual_spanish 2 829
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/740631_mbcdi_24_bilingual_spanish.csv 24 bilingual_spanish 26 830
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/331453_mbcdi_24_english.csv 24 english 3 352
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/363381_mbcdi_24_english.csv 24 english 8 352
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/740629_mbcdi_24_english.csv 24 english 133 358

12-mo-old English speakers

# Keep only the 12-month English forms that hold at least one record.
eng_12_files <- dplyr::filter(
  mbcdi_file_dat,
  age_group == "12",
  lang_cond == "english",
  n_subs > 0
)

# Apply mcdi_clean_12_csv() to each file and row-bind the results.
eng_12_combined_df <- eng_12_files$fn |>
  purrr::map(mcdi_clean_12_csv) |>
  purrr::list_rbind()

# Write the combined table to the aggregate directory.
eng_12_fn <- file.path(
  here::here(), "data", "csv", "home_visit", "agg",
  "mcdi_english_12_combined.csv"
)
readr::write_csv(eng_12_combined_df, eng_12_fn)

There are \(n=\) 186 participant records.

18-mo-old English speakers

This code should be wrapped in functions since many of the components duplicate one another.

# Keep only the 18-month English forms that hold at least one record.
eng_18_files <- dplyr::filter(
  mbcdi_file_dat,
  age_group == "18",
  lang_cond == "english",
  n_subs > 0
)

# Apply mcdi_clean_18_24_csv() to each file and row-bind the results.
eng_18_combined_df <- eng_18_files$fn |>
  purrr::map(mcdi_clean_18_24_csv) |>
  purrr::list_rbind()

# Write the combined table to the aggregate directory.
eng_18_fn <- file.path(
  here::here(), "data", "csv", "home_visit", "agg",
  "mcdi_english_18_combined.csv"
)
readr::write_csv(eng_18_combined_df, eng_18_fn)

There are \(n=\) 170 participant records.

24-mo-old English speakers

# Keep only the 24-month English forms that hold at least one record.
eng_24_files <- dplyr::filter(
  mbcdi_file_dat,
  age_group == "24",
  lang_cond == "english",
  n_subs > 0
)

# Apply mcdi_clean_18_24_csv() to each file and row-bind the results.
eng_24_combined_df <- eng_24_files$fn |>
  purrr::map(mcdi_clean_18_24_csv) |>
  purrr::list_rbind()

# Write the combined table to the aggregate directory.
eng_24_fn <- file.path(
  here::here(), "data", "csv", "home_visit", "agg",
  "mcdi_english_24_combined.csv"
)
readr::write_csv(eng_24_combined_df, eng_24_fn)

There are \(n=\) 144 participant records.

Old code

The following code is deprecated as of 2023-10-25, and is not run.

For simplicity, we’ll start with the youngest age group, and with the English speakers. There are \(n=3\) forms, with 4, 10, and 111 participants each, and 254, 253, and 257 variables.

# 12-month English forms that hold at least one record.
eng_12_files <- dplyr::filter(
  mbcdi_file_dat,
  age_group == "12",
  lang_cond == "english",
  n_subs > 0
)

We’ll examine the first one.

# Read the first 12-month English form with every column as character.
eng_12_331 <- readr::read_csv(
  eng_12_files$fn[1],
  col_types = readr::cols(.default = "c"),
  show_col_types = FALSE
)

# Peek at the first few column names.
head(names(eng_12_331))

Let’s try trimming the metadata labels.

# Keep only the last path component of each column name.
eng_12_331_trim_names <- basename(names(eng_12_331))

head(eng_12_331_trim_names)

That looks better. Let’s look at the second file.

# Read the second 12-month English form with every column as character.
eng_12_363 <- readr::read_csv(
  eng_12_files$fn[2],
  col_types = readr::cols(.default = "c"),
  show_col_types = FALSE
)

# First few column names after trimming to the last path component.
head(basename(names(eng_12_363)))

And the third one.

# Read the third 12-month English form with every column as character.
eng_12_740625 <- readr::read_csv(
  eng_12_files$fn[3],
  col_types = readr::cols(.default = "c"),
  show_col_types = FALSE
)

# First few column names after trimming to the last path component.
head(basename(names(eng_12_740625)))

Now, we’ll create a function to clean the variable names.

#' Read a CSV and shorten its column names to their final path component.
#'
#' Every column is read as character. Column names are then replaced by
#' `basename()` of the original name, i.e. everything after the last `/`.
#'
#' @param csv_fn Path to a single, readable CSV file.
#' @return The data frame returned by `readr::read_csv()` with trimmed
#'   column names.
select_basename <- function(csv_fn) {
  # The assertthat predicates only return TRUE/FALSE; assert_that() is what
  # turns a failed check into an informative error.
  assertthat::assert_that(assertthat::is.string(csv_fn))
  assertthat::assert_that(assertthat::is.readable(csv_fn))

  df <- readr::read_csv(
    csv_fn,
    col_types = readr::cols(.default = "c"),
    show_col_types = FALSE
  )

  names(df) <- basename(names(df))
  df
}
# Preview the third form after trimming its column names.
head(select_basename(eng_12_files$fn[3]))

Let’s trim unneeded fields. We’ll write several helper functions to do this.

#' Drop non-item columns from a CDI data frame.
#'
#' Removes every column whose name contains "note", "instructions",
#' "comments", "continue", "vocab", or "mcdi".
#'
#' @param df A data frame of CDI responses.
#' @return `df` without the matching columns.
trim_cdi_fields <- function(df) {
  dplyr::select(
    df,
    -c(
      contains("note"),
      contains("instructions"),
      contains("comments"),
      contains("continue"),
      contains("vocab"),
      contains("mcdi")
    )
  )
}

#' Add a running row index column named `play_i`.
#'
#' @param df A data frame.
#' @return `df` with an extra integer column `play_i` numbering its rows
#'   1..nrow(df). A zero-row input yields a zero-row output with the column
#'   present.
#'
# NOTE(review): the index counts rows of `df`, not distinct participants;
# confirm that is the intent wherever this is applied to long-format data.
add_particip_index <- function(df) {
  # seq_len() is safe for zero rows, unlike 1:n which would give c(1, 0).
  dplyr::mutate(df, play_i = seq_len(nrow(df)))
}

#' Reshape a wide CDI data frame to one row per participant-word response.
#'
#' Columns 2 through the last are pivoted into `word` /
#' `understands_or_says` pairs and rows with a missing response are dropped.
#' The first match of "understands___" and then of "understands_says" in a
#' response is replaced with "says"; the first match of "mommy_001" and
#' "bath_001" in a word is replaced with "mommy" and "bath".
#'
#' @param df A wide data frame whose first column is kept as the identifier.
#' @return A long data frame with columns for the identifier, `word`, and
#'   `understands_or_says`.
make_cdi_longer <- function(df) {
  long_df <- tidyr::pivot_longer(
    df,
    cols = 2:ncol(df),
    names_to = "word",
    values_to = "understands_or_says"
  )
  long_df <- dplyr::filter(long_df, !is.na(understands_or_says))

  dplyr::mutate(
    long_df,
    understands_or_says = understands_or_says |>
      stringr::str_replace("understands___", "says") |>
      stringr::str_replace("understands_says", "says"),
    word = word |>
      stringr::str_replace("mommy_001", "mommy") |>
      stringr::str_replace("bath_001", "bath")
  )
}

Then we combine them into an omnibus function.

#' Read and clean one CDI CSV into long format.
#'
#' Runs, in order: `select_basename()`, `trim_cdi_fields()`, a rename of
#' `participant_id` to `play_id`, `make_cdi_longer()`, and
#' `add_particip_index()`.
#'
#' @param csv_fn Path to a CDI CSV file.
#' @return The long-format data frame produced by the final step.
clean_cdi <- function(csv_fn) {
  renamed <- select_basename(csv_fn)
  trimmed <- trim_cdi_fields(renamed)
  keyed <- dplyr::rename(trimmed, "play_id" = "participant_id")
  long_df <- make_cdi_longer(keyed)
  add_particip_index(long_df)
}

Now, we can run clean_cdi() across all three files.

# Clean every 12-month English form and stack the results.
eng_12 <- eng_12_files$fn |>
  purrr::map(clean_cdi) |>
  purrr::list_rbind()

# Cross-tabulate words against response categories.
xtabs(~ word + understands_or_says, data = eng_12)

18-mo-old English speakers

Let’s move on to the 18-mo-old English speakers.

# 18-month English forms that hold at least one record.
eng_18_files <- dplyr::filter(
  mbcdi_file_dat,
  age_group == "18",
  lang_cond == "english",
  n_subs > 0
)

There are \(n=\) 3 files with participant data.

# Run the 12-month cleaner on each 18-month form, one at a time.
eng_18_files$fn[1] |> clean_cdi()
eng_18_files$fn[2] |> clean_cdi()
eng_18_files$fn[3] |> clean_cdi()

There are some duplicate entries for some words.

We need a strategy for reconciling these duplicates: candy, leg, rain, wet.

It’s not elegant, but I have one for modifying the duplicate names. See below.

24-mo-old English speakers

Let’s move on to the 24-mo-old English speakers.

# 24-month English forms that hold at least one record.
eng_24_files <- dplyr::filter(
  mbcdi_file_dat,
  age_group == "24",
  lang_cond == "english",
  n_subs > 0
)

There are \(n=\) 3 files with participant data.

Let’s see how clean_cdi() works on one of these.

# Run the 12-month cleaner on the first 24-month form.
eng_24_files$fn[1] |> clean_cdi()

Once again, we have duplicates for several items: ‘candy’, ‘leg’, ‘rain’, ‘wet’.

It’s very hacky, but I think we might want to modify these item names until we figure out a better way to handle the duplicates.

#' Make repeated column names unique by appending a running suffix.
#'
#' Every column whose name equals `dupe` is renamed to `<dupe>_1`,
#' `<dupe>_2`, ... in column order. A single occurrence is also renamed (to
#' `<dupe>_1`). If no column matches, `df` is returned unchanged.
#'
#' @param df A data frame, possibly with repeated column names.
#' @param dupe The column name to disambiguate.
#' @return `df` with the matching columns renamed.
modify_mcdi_dupes <- function(df, dupe = "leg") {
  dup_index <- which(names(df) == dupe)

  # Explicit guard: the old 1:length() loop only coped with "no match"
  # through NA / zero-length sub-assignment quirks.
  if (length(dup_index) == 0) {
    return(df)
  }

  names(df)[dup_index] <- paste0(dupe, "_", seq_along(dup_index))
  df
}

#' Read a CSV with every column typed as character.
#'
#' @param csv_fn Path to a single, readable CSV file.
#' @return The data frame returned by `readr::read_csv()`.
open_csv <- function(csv_fn) {
  # The assertthat predicates only return TRUE/FALSE; assert_that() is what
  # turns a failed check into an informative error.
  assertthat::assert_that(assertthat::is.string(csv_fn))
  assertthat::assert_that(assertthat::is.readable(csv_fn))

  # Returned as the final expression (not an assignment) so the result is
  # visible when the function is called at top level.
  readr::read_csv(
    csv_fn,
    col_types = readr::cols(.default = "c"),
    show_col_types = FALSE
  )
}

#' Drop non-item columns from an 18/24-month CDI data frame.
#'
#' Removes every column whose name contains "note", "instructions",
#' "comments", "continue", "vocab", or "mcdi".
#'
#' @param df A data frame of CDI responses.
#' @return `df` without the matching columns.
trim_cdi_18_24_fields <- function(df) {
  dplyr::select(
    df,
    -contains(
      c("note", "instructions", "comments", "continue", "vocab", "mcdi")
    )
  )
}

#' Reshape a wide 18/24-month CDI data frame to long format.
#'
#' Columns 2 through the last are pivoted into `word` / `knows` pairs; the
#' first column is kept as the identifier. Missing responses are retained.
#'
#' @param df A wide data frame whose first column identifies the participant.
#' @return A long data frame with the identifier, `word`, and `knows`.
make_cdi_18_24_longer <- function(df) {
  tidyr::pivot_longer(
    df,
    cols = 2:ncol(df),
    names_to = "word",
    values_to = "knows"
  )
}

#' Read and clean one 18/24-month CDI CSV, disambiguating repeated items.
#'
#' Column names are trimmed to their final path component, then the items
#' "leg", "candy", "rain", and "wet" are passed through
#' `modify_mcdi_dupes()` in that order. The result is trimmed with
#' `trim_cdi_18_24_fields()`, `participant_id` is renamed to `play_id`, and
#' the table is reshaped with `make_cdi_18_24_longer()`.
#'
#' @param csv_fn Path to a CDI CSV file.
#' @return The long-format data frame produced by the final step.
clean_cdi_18_24_dedupe <- function(csv_fn) {
  raw_df <- open_csv(csv_fn)
  names(raw_df) <- basename(names(raw_df))

  # Fold the de-duplication over each repeated item in turn.
  deduped <- Reduce(
    function(acc, item) modify_mcdi_dupes(acc, dupe = item),
    c("leg", "candy", "rain", "wet"),
    raw_df
  )

  trimmed <- trim_cdi_18_24_fields(deduped)
  keyed <- dplyr::rename(trimmed, "play_id" = "participant_id")
  make_cdi_18_24_longer(keyed)
}
# Clean every 24-month English form and stack the results.
eng_24 <- eng_24_files$fn |>
  purrr::map(clean_cdi_18_24_dedupe) |>
  purrr::list_rbind()

# Cross-tabulate words against the recorded response.
xtabs(~ word + knows, data = eng_24)

Now, we can return to the 18-mo-old data to see if this works:

# Clean every 18-month English form and stack the results.
eng_18 <- eng_18_files$fn |>
  purrr::map(clean_cdi_18_24_dedupe) |>
  purrr::list_rbind()

# Cross-tabulate words against the recorded response.
xtabs(~ word + knows, data = eng_18)

It does.