rm(list = ls(all.names = TRUE)) #will clear all objects includes hidden objects.
gc() #free up memrory and report the memory usage.
##          used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 421808 22.6     878077 46.9   654177 35.0
## Vcells 815118  6.3    8388608 64.0  1770539 13.6

1 Data Preparation

1.1 Loading libraries

The following libraries and default settings were used during the analysis:

options(scipen = 999)

# if (!requireNamespace("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")
# BiocManager::install("survcomp")

library(tidyverse)
library(broom)
library(eNetXplorer)
library(glmnet)
library(olsrr)
library(caret)
library(pander)
library(tidymodels)
library("cowplot")


##parallel map

#library(parallel)  
library("furrr")
future::plan(multiprocess(workers = 16))

theme_set(theme_bw() + theme(panel.grid = element_blank()))

1.2 Loading the data

We first loaded all of the relevant data files (not shown here as they refer to local directories):

MRFINDINGS01 <-read.csv(paste0(dataFold, "ABCD_MRFINDINGS01_DATA_TABLE.csv")) %>% 
  filter(EVENTNAME =="baseline_year_1_arm_1") 
MRIQCRP102 <-read.csv(paste0(dataFold, "MRIQCRP102_DATA_TABLE.csv")) %>% 
  filter(EVENTNAME =="baseline_year_1_arm_1") 
MRIQCRP202 <-read.csv(paste0(dataFold, "MRIQCRP202_DATA_TABLE.csv")) %>% 
  filter(EVENTNAME =="baseline_year_1_arm_1") 
MRIQCRP302 <-read.csv(paste0(dataFold, "MRIQCRP302_DATA_TABLE.csv")) %>% 
  filter(EVENTNAME =="baseline_year_1_arm_1") 
FREESQC01 <-read.csv(paste0(dataFold, "FREESQC01_DATA_TABLE.csv")) %>% 
  filter(EVENTNAME =="baseline_year_1_arm_1") 
DMRIQC01 <-read.csv(paste0(dataFold, "DMRIQC01_DATA_TABLE.csv")) %>% 
  filter(EVENTNAME =="baseline_year_1_arm_1") 
NBackBeh <-read.csv(paste0(dataFold, "ABCD_MRINBACK02_DATA_TABLE.csv")) %>% 
  filter(EVENTNAME =="baseline_year_1_arm_1") 
NBackAparc <-read.csv(paste0(dataFold, "NBACK_BWROI02_DATA_TABLE.csv")) %>% 
  filter(EVENTNAME =="baseline_year_1_arm_1") 
NbackAsegDest <-read.csv(paste0(manipuFold, "NbackDestAsegReadableGgseg3d.csv")) 
# NbackAsegDestR1 <-read.csv(paste0(manipuFold, "NbackDestAsegReadableGgseg3dRunOne.csv"))
# NbackAsegDestR2 <-read.csv(paste0(manipuFold, "NbackDestAsegReadableGgseg3dRunTwo.csv")) 
MRIinfo <-tbl_df(read.csv(paste0(dataFold, "ABCD_MRI01_DATA_TABLE.csv"))) %>% 
  filter(EVENTNAME =="baseline_year_1_arm_1") 
Siteinfo <-tbl_df(read.csv(paste0(dataFold, "ABCD_LT01_DATA_TABLE.csv"))) %>% 
  filter(EVENTNAME =="baseline_year_1_arm_1") 
NIH_TB <-tbl_df(read.csv(paste0(dataFold,"ABCD_TBSS01_DATA_TABLE.csv"))) %>% 
  filter(EVENTNAME =="baseline_year_1_arm_1") 
LittleMan <-tbl_df(read.csv(paste0(dataFold,"LMTP201_DATA_TABLE.csv"))) %>% 
  filter(EVENTNAME =="baseline_year_1_arm_1") 
Pearson <-tbl_df(read.csv(paste0(dataFold,"ABCD_PS01_DATA_TABLE.csv"))) %>% 
  filter(EVENTNAME =="baseline_year_1_arm_1")

short_names <- tbl_df(read.csv(paste0(anotherFold,"ShortNames_all.csv") ))

MRIQcAll <- plyr::join_all(list(MRFINDINGS01,MRIQCRP102,
                          MRIQCRP202,MRIQCRP302,FREESQC01,DMRIQC01,
                          NBackBeh,NBackAparc,NbackAsegDest,MRIinfo,Siteinfo,NIH_TB,LittleMan,Pearson), 
                          by='SUBJECTKEY', type='full')

MRIQcAll <- MRIQcAll[,!duplicated(colnames(MRIQcAll))]

1.3 Quality control (QC)

Next, we included only the participants that passed the following QC:

MRIQcAll$NoIncidental <- ifelse((MRIQcAll$MRIF_SCORE== 3 | 
                                   MRIQcAll$MRIF_SCORE== 4 |
                                   MRIQcAll$MRIF_HYDROCEPHALUS == "yes"|         
                                   MRIQcAll$MRIF_HERNIATION == "yes"), 0, 1)
count(MRIQcAll,NoIncidental)
##   NoIncidental     n
## 1            0   451
## 2            1 11359
## 3           NA    65
MRIQcAll %>% count(IQC_T1_OK_SER)
##   IQC_T1_OK_SER     n
## 1             0    63
## 2             1 10553
## 3             2   870
## 4             3    97
## 5            NA   292
MRIQcAll %>% count(FSQC_QC)
##   FSQC_QC     n
## 1       0   462
## 2       1 11076
## 3      NA   337
MRIQcAll$T1FreeSurferQCOk <- ifelse((MRIQcAll$IQC_T1_OK_SER > 0 & 
                                       MRIQcAll$FSQC_QC == 1), 1, 0)

count(MRIQcAll,T1FreeSurferQCOk)
##   T1FreeSurferQCOk     n
## 1                0   524
## 2                1 11004
## 3               NA   347
MRIQcAll %>% count(IQC_NBACK_OK_SER>0)
##   IQC_NBACK_OK_SER > 0     n
## 1                FALSE   143
## 2                 TRUE 10045
## 3                   NA  1687
MRIQcAll %>% count(TFMRI_NBACK_BEH_PERFORMFLAG==1)
##   TFMRI_NBACK_BEH_PERFORMFLAG == 1    n
## 1                            FALSE 1464
## 2                             TRUE 8004
## 3                               NA 2407
MRIQcAll %>% count(TFMRI_NBACK_ALL_BETA_DOF>200)
##   TFMRI_NBACK_ALL_BETA_DOF > 200    n
## 1                          FALSE   33
## 2                           TRUE 8821
## 3                             NA 3021
MRIQcAll$NbackBehDofOk <- ifelse((MRIQcAll$IQC_NBACK_OK_SER>0 &
                                    MRIQcAll$TFMRI_NBACK_BEH_PERFORMFLAG ==1 &
                                    MRIQcAll$TFMRI_NBACK_ALL_BETA_DOF>200), 1, 0)
count(MRIQcAll,NbackBehDofOk)
##   NbackBehDofOk    n
## 1             0 1602
## 2             1 7439
## 3            NA 2834
MRIQcAll$AllNbackQc <- ifelse((MRIQcAll$NoIncidental == 1 & 
                                 MRIQcAll$T1FreeSurferQCOk == 1 & 
                                 MRIQcAll$NbackBehDofOk == 1), 1, 0)
count(MRIQcAll,AllNbackQc)
##   AllNbackQc    n
## 1          0 2399
## 2          1 6947
## 3         NA 2529
Nback.QCed <- MRIQcAll %>% filter(AllNbackQc == 1)

1.4 Remove Phillips

There was an issue was reported with the Philips scanners and it was recommended that the data from these scanners should be dropped. We did so:

Nback.QCed %>% count(MRI_INFO_MANUFACTURER)
##     MRI_INFO_MANUFACTURER    n
## 1      GE MEDICAL SYSTEMS 1353
## 2 Philips Medical Systems  868
## 3                 SIEMENS 4726
#remove phil and add beh during fMRI
Nback.QCedNoPhil <- Nback.QCed %>%
  filter(MRI_INFO_MANUFACTURER != 'Philips Medical Systems') 

#check how many variables in X2backVS0back and the list of ROIs
Nback.2backVS0back <- Nback.QCedNoPhil%>% select(.,starts_with("X2backvs0back"))
#colnames(Nback.2backVS0back)

Here are a list of responsevariable names which are used in the later analysis. Both short names ang long names are used in plotting.

Resp_Var <- c('TFMRI_NB_ALL_BEH_C2B_RATE',
              "NIHTBX_PICVOCAB_UNCORRECTED", 
              "NIHTBX_FLANKER_UNCORRECTED",
              "NIHTBX_LIST_UNCORRECTED",
              "NIHTBX_CARDSORT_UNCORRECTED",
              "NIHTBX_PATTERN_UNCORRECTED",
              "NIHTBX_PICTURE_UNCORRECTED",
              "NIHTBX_READING_UNCORRECTED",
              "LMT_SCR_PERC_CORRECT",
              "PEA_RAVLT_LD_TRIAL_VII_TC",
              "PEA_WISCV_TRS")
resp_var_plotting_long <- c("2-back working memory",
  "Picture vocabulary test",
  "Flanker test",
  "List sorting working memory",
  "Dimentional change card sort test",
  "Pattern comparison processing speed test",
  "Picture sequence memory test",
  "Oral reading recognition test",
  "Little man task correct percentage",
  "RAVLT long delay trial VII total correct",
  "WISC_V matrix reasoning total raw score"
)
resp_var_plotting_short <- c("2-back Work Mem","Pic Vocab","Flanker","List Work Mem","Cog Flex","Pattern Speed","Seq Memory","Reading Recog","Little Man","Audi Verbal","Matrix Reason")
resp_var_plotting <- tibble("response" = all_of(Resp_Var), "longer_name"=resp_var_plotting_long,"short_name"=resp_var_plotting_short)

subj_info <- c('SUBJECTKEY', 'MRI_INFO_DEVICESERIALNUMBER', 'SITE_ID_L')

data_all_average <- Nback.QCedNoPhil %>%
  select(SUBJECTKEY, all_of(subj_info), all_of(Resp_Var),
         starts_with('X2backvs0back')) %>%
  rename_at(vars(-all_of(subj_info),-all_of(Resp_Var)),
            ~ str_replace(., 'X2backvs0back_ROI_', 'roi_'))
### checking whether the short names and ROI names in the data are the same
name_check <- which(short_names$roi != str_remove(names(select(data_all_average,starts_with("roi_"))),"roi_"))
print(name_check)
## integer(0)
new_shorter_names <- short_names 
new_shorter_names$roiShort[97] <- "R Subcentral"
new_shorter_names$roiShort <-  str_squish(string = new_shorter_names$roiShort)
#new_shorter_names$roiShort <-  str_replace_all(string = new_shorter_names$roiShort, pattern = "\t", replacement = "")
### change ROI names to shorter ones (does not compat with formulas so not used)
#data_all_average <- data_all_average %>% rename_at(vars(starts_with("roi_")),~ new_shorter_names$roiShort)

1.5 Selecting data

We also performed listwise deletion and dropped all participants for whom either the behavioral performance or the activation across some brain area was greater than 3 * IQR (interquartile range).

data_all_listwise <- data_all_average %>%
  drop_na() 

IQR_remove <- function(data_split){
  data_split%>%
  mutate_at(vars(- all_of(subj_info)), ~ ifelse(
    .x > quantile(.x, na.rm = TRUE)[4] + 3 * IQR(.x, na.rm = TRUE) |
    .x < quantile(.x, na.rm = TRUE)[2] - 3 * IQR(.x, na.rm = TRUE), 
    NA, .x)) %>%
  drop_na() %>%
  mutate_if(is.double, ~ (.x - mean(.x)) / sd(.x)) 
}
  

n_all <- nrow(data_all_average)
n_listwise <- nrow(data_all_listwise)
n_diff_all_listwise <- nrow(data_all_average) - nrow(data_all_listwise)

The outliers are removed with respect to training and testing data set. So no count of columns is displayed here.

1.6 Check participant numbers across sites and scanners

Next we checked the number of participants across sites and scanners: Note that site 22 and site08 have fewer than 100 participants whcn IQR rules were applied.

nrow(data_all_listwise)
## [1] 5668
#26 scanners but 8 have fewer than 100 participants (as low as 3)
data_all_listwise  %>% count(MRI_INFO_DEVICESERIALNUMBER) %>% 
  summarize(`Number of scanners`=n()) %>%
  pander(split.cell = 80, split.table = Inf, justify = 'left')
Number of scanners
26
data_all_listwise  %>% 
  count(MRI_INFO_DEVICESERIALNUMBER) %>%
  arrange(n) %>% 
  rename(`Scanner ID` = MRI_INFO_DEVICESERIALNUMBER, `Number of participants` = n) %>%
  pander(split.cell = 80, split.table = Inf, justify = 'left')
Scanner ID Number of participants
HASHe76e6d72 6
HASH48f7cbc3 20
HASH31ce566d 32
HASHe3ce02d3 40
HASH7f91147d 59
HASH69f406fa 77
HASHfeb7e81a 84
HASHc9398971 129
HASH5b2fcf80 142
HASH4b0b8b05 169
HASH7911780b 174
HASHa3e45734 191
HASH65b39280 193
HASH03db707f 211
HASH311170b9 221
HASH4d1ed7b1 231
HASHc3bf3d9c 243
HASHd422be27 273
HASHd7cb4c6d 294
HASHb640a1b8 305
HASHe4f6957a 342
HASH11ad4ed5 350
HASH5b0cf1bb 359
HASH1314a204 370
HASH96a0c182 371
HASH3935c89e 782
#19 sites, but 2 sites are fewer than 100 -> use sites?
data_all_listwise  %>% 
  count(SITE_ID_L) %>% 
  summarize(`Number of sites`=n())  %>%
  pander(split.cell = 80, split.table = Inf, justify = 'left')
Number of sites
19
data_all_listwise  %>% 
  count(SITE_ID_L) %>% 
  arrange(n) %>% 
  rename(`Site ID` = SITE_ID_L, `Number of participants` = n) %>%
  pander(split.cell = 80, split.table = Inf, justify = 'left')
Site ID Number of participants
site22 20
site08 143
site15 174
site18 191
site07 193
site11 211
site05 221
site09 230
site04 253
site21 311
site13 320
site10 334
site03 359
site02 370
site06 371
site12 374
site20 402
site14 409
site16 782
# Remove site 22 and 8
data_all_listwise <- data_all_listwise %>%
  filter(SITE_ID_L != 'site22' & SITE_ID_L != 'site08')

1.7 Check the distribution of all of the cognitive tasks

Most are normally distributed.

#Look at distribution of NIHTBX_LIST_UNCORRECTED (i.e., working memory from Nback)
resp_names <-data_all_listwise %>% select(all_of(Resp_Var))%>%
  names()%>%
  set_names()

density_plot_grid <- resp_names %>%
  map(~ggplot(data_all_listwise,aes(x=.data[[.]])) 
      +stat_function(fun=dnorm, 
                color="skyblue", size = 1.5,
                args=list(mean=mean(data_all_listwise$.),
                          sd=sd(data_all_listwise$.))) +
  geom_density() +
  labs(x = NULL, y =NULL,title = resp_var_plotting$short_name[[which(resp_var_plotting$response==.)]])
)
title_density_plot <- ggdraw() + 
  draw_label(
    "Density plots of all the Cognitive Performance Variables",
    fontface = 'bold',
    x = 0,
    hjust = 0
  ) +
  theme(
    # add margin on the left of the drawing canvas,
    # so title is aligned with left edge of first plot
    plot.margin = margin(0, 0, 0, 7)
  )
plot_grid(title_density_plot,plot_grid(plotlist = density_plot_grid),nrow = 2 , rel_heights = c(0.1, 1))