
0.1 Note

Here we used the ABCD release 3.0 data set.

0.2 Loading libraries

The following libraries and default settings were used during the analysis:

options(scipen = 999)

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.6     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(broom)
library(eNetXplorer)
library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loaded glmnet 4.1-4
library(olsrr)
## 
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
## 
##     rivers
library(pander)
library(tidymodels)
## -- Attaching packages -------------------------------------- tidymodels 0.2.0 --
## v dials        0.1.1     v rsample      0.1.1
## v infer        1.0.0     v tune         0.2.0
## v modeldata    0.1.1     v workflows    0.2.6
## v parsnip      0.2.1     v workflowsets 0.2.1
## v recipes      0.2.0     v yardstick    0.0.9
## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x scales::discard() masks purrr::discard()
## x Matrix::expand()  masks tidyr::expand()
## x dplyr::filter()   masks stats::filter()
## x recipes::fixed()  masks stringr::fixed()
## x dplyr::lag()      masks stats::lag()
## x Matrix::pack()    masks tidyr::pack()
## x yardstick::spec() masks readr::spec()
## x recipes::step()   masks stats::step()
## x Matrix::unpack()  masks tidyr::unpack()
## x recipes::update() masks Matrix::update(), stats::update()
## * Search for functions across packages at https://www.tidymodels.org/find/
library("cowplot")
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:eNetXplorer':
## 
##     export
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
## parallel map (furrr)
library("ggplot2")
library("furrr")
## Loading required package: future
library("party", quietly = TRUE)
## 
## Attaching package: 'modeltools'
## The following object is masked from 'package:tune':
## 
##     parameters
## The following object is masked from 'package:parsnip':
## 
##     fit
## The following object is masked from 'package:infer':
## 
##     fit
## The following object is masked from 'package:dials':
## 
##     parameters
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Attaching package: 'strucchange'
## The following object is masked from 'package:stringr':
## 
##     boundary
library("permimp")

theme_set(theme_bw() + theme(panel.grid = element_blank()))
## register the number of cores for parallel processing
all_cores <- parallel::detectCores(logical = FALSE) - 2

1 Data Preparation

1.1 Load data files

We first loaded all of the relevant data files (not shown here as they refer to local directories):

1.2 Load all of the data after quality control

The aim is to compute the following:
structural MRI: cortical thickness and subcortical volume
resting-state fMRI: Gordon network correlations
stop-signal task (SST): any-stop vs correct-go contrast
MID (monetary incentive delay): anticipation of reward vs neutral contrast
N-back task fMRI: 2-back vs 0-back contrast
DTI: FA from the Hagler atlas

### loading site and scanner information
MRIinfo <-tibble::as_tibble(read.csv(paste0(datafolder, "ABCD_MRI01_DATA_TABLE.csv"))) 
Siteinfo <-tibble::as_tibble(read.csv(paste0(datafolder, "ABCD_LT01_DATA_TABLE.csv")))
MriandSite <- left_join(MRIinfo,Siteinfo, by=c('SUBJECTKEY','EVENTNAME'))
MRIinfo  %>% count(EVENTNAME,SEX)
## # A tibble: 4 x 3
##   EVENTNAME                SEX       n
##   <chr>                    <chr> <int>
## 1 2_year_follow_up_y_arm_1 F      2617
## 2 2_year_follow_up_y_arm_1 M      3076
## 3 baseline_year_1_arm_1    F      5631
## 4 baseline_year_1_arm_1    M      6161
### loading response variables
NIH_TB <-tibble::as_tibble(read.csv(paste0(datafolder,"ABCD_TBSS01_DATA_TABLE.csv"))) 
LittleMan <-tibble::as_tibble(read.csv(paste0(datafolder,"LMTP201_DATA_TABLE.csv"))) 
Pearson <-tibble::as_tibble(read.csv(paste0(datafolder,"ABCD_PS01_DATA_TABLE.csv"))) 
NBackBeh <- tibble::as_tibble(read.csv(paste0(datafolder, "ABCD_MRINBACK02_DATA_TABLE.csv")) ) 

### loading brain scan data
smri_data <- tibble::as_tibble(read.csv( paste0(datafold,"sMRIDestAsegReadableGgsegQCedWeighted.csv")))
Nback_data <- tibble::as_tibble(read.csv(paste0(datafold, "NBackDestAsegReadableGgseg3dQCed.csv")))
MID_data <- tibble::as_tibble(read.csv(paste0(datafold, "MIDDestAsegReadableGgseg3dQCed.csv")))
SST_data <- tibble::as_tibble(read.csv(paste0(datafold, "SSTDestAsegReadableGgseg3d_QCed.csv"))) 
rsmri_data <- tibble::as_tibble(read.csv(paste0(datafold, "RSBetNetExcludedNoCOMBATABCD3.csv")))
DTI_data <- tibble::as_tibble(read.csv(paste0(datafold, "DTA_FA23Tracks.csv")))
### Note that we have not removed NAs before Enet tuning, to keep the largest number of participants

short_names <- tibble::as_tibble(read.csv(paste0(scriptfold,"ShortNames_all.csv") ))

### load vision screening data; according to the documentation, some participants could not see the iPad screen
vision_idx <- tibble::as_tibble(read.table(paste0(NDAfold, "abcd_svs01.txt"),header = TRUE))

1.3 Missing values for DTI

Sites 01, 17 and 19 have no DTI data:

MriandSite %>% select(SUBJECTKEY,EVENTNAME, SITE_ID_L) %>%
  full_join(DTI_data, by = c('SUBJECTKEY','EVENTNAME')) %>%
  select(FA_R_fornix, SITE_ID_L) %>%
  group_by(SITE_ID_L) %>%
  naniar::miss_var_summary() %>%
  arrange(desc(pct_miss))
## # A tibble: 23 x 4
## # Groups:   SITE_ID_L [23]
##    SITE_ID_L variable    n_miss pct_miss
##    <chr>     <chr>        <int>    <dbl>
##  1 site17    FA_R_fornix    890    100  
##  2 site19    FA_R_fornix    840    100  
##  3 site01    FA_R_fornix    562     99.8
##  4 site08    FA_R_fornix    113     22.6
##  5 site13    FA_R_fornix    233     21.7
##  6 site22    FA_R_fornix      7     20.6
##  7 site10    FA_R_fornix    203     17.7
##  8 site04    FA_R_fornix    209     17.4
##  9 site18    FA_R_fornix     85     15.3
## 10 site15    FA_R_fornix    100     15.2
## # ... with 13 more rows

1.4 Calculate partial correlations for resting state

We 1) kept the within-network z-transformed zero-order correlations; 2) took the means of the between-network z-transformed zero-order correlations; 3) converted these Fisher z values back to r and rearranged them into a correlation matrix; 4) transformed this matrix into “partial” correlations with cor2pcor; and 5) converted the results back to Fisher z.
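
Before the full implementation below, a minimal sketch of steps 3)–5) on a toy three-network case may help (the z values here are made up purely for illustration):

library("corpcor")
library("DescTools")
z <- c(0.5, 0.3, 0.2)                  # toy Fisher-z correlations for pairs (1,2), (1,3), (2,3)
R <- diag(3)                           # step 3: rebuild the 3 x 3 correlation matrix
R[upper.tri(R)] <- FisherZInv(z)
R[lower.tri(R)] <- t(R)[lower.tri(R)]
P <- cor2pcor(R)                       # step 4: convert to partial correlations
FisherZ(P[upper.tri(P)])               # step 5: back to Fisher z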

library("corpcor")
ROI_names <- c("Auditory","CinguloOpercular","CinguloParietal","Default","DorsalAttention",
               "FrontoParietal","None","RetrosplenialTemporal","SensorimotorHand","SensorimotorMouth",
               "Salience","VentralAttention")
## lengths of successive upper-triangle rows (11 between-network pairs down to 1)
diff_seq <- seq(from = 11, to = 1, by = -1)
diff_seq2 <- seq(from = 11, to = 2) + 1
stepwise_sum <- function(i){
  return(sum(diff_seq[1:i]))
}
## number of between-network ("avg") columns
dim_vector <- dim(select(rsmri_data, starts_with("avg")))[2]
## solve n(n-1)/2 = dim_vector for n via the quadratic formula,
## i.e. n = (1 + sqrt(1 + 8 * dim_vector)) / 2, to recover the matrix dimension
dim_matrix <- (-(-1) + sqrt(1 - 4 * 1 * 2 * (-dim_vector))) / 2

## start (low_seq) and end (upp_seq) indices of each matrix row in the flat column vector
step_seq <- map_dbl(.x = seq(from = 1, to = 11, by = 1), .f = stepwise_sum)
upp_seq <- append(12, step_seq + 12)
low_seq <- append(1, map_dbl(.x = seq(from = 1, to = 10, by = 1), .f = function(.x){
  return(sum(diff_seq2[1:.x]) + 1)
})) %>% append(., 78)

partialcor <- function(data_split){
  ## order the "avg" (between-network) columns alphabetically and convert Fisher z back to r
  cor_RS <- select(data_split, starts_with("avg"))
  cor_RS <- DescTools::FisherZInv(cor_RS[, order(names(cor_RS))])
  avg_cor_array <- array(rep(NA,dim_matrix^2*dim(cor_RS)[1]),
                         dim = c(dim_matrix,dim_matrix,dim(cor_RS)[1]))
  Par_cor_array <- array(rep(NA,dim_matrix^2*dim(cor_RS)[1]),
                         dim = c(dim_matrix,dim_matrix,dim(cor_RS)[1]))
  
  ## rebuild row j of the flattened correlations into a symmetric 13 x 13 matrix
  row_to_matrix <- function(j){
    target=matrix(rep(1,dim_matrix^2),nrow=dim_matrix)
    for(i in 1:length(low_seq)){
      target[(i+1):13,i]=target[i,(i+1):13]=as.numeric(cor_RS[j,low_seq[i]:upp_seq[i]])
    }
    target[12,13]=target[13,12]=as.numeric(cor_RS[j,upp_seq[i]])
    return(target)
  }
  
  avg_cor_array <-map(.x=seq(from=1,to=dim(cor_RS)[1],by=1),.f=row_to_matrix)
  Par_cor_array <- avg_cor_array %>% map(.,cor2pcor)
  
  ## flatten partial-correlation matrix j back into a row of Fisher z values
  matrix_to_row <- function(j){
    target_vec<-rep(NA,dim(cor_RS)[2])
    for(i in 1:length(upp_seq)){
      target_vec[low_seq[i]:upp_seq[i]]<-as.vector(Par_cor_array[[j]][i,(i+1):13])
    }
    return(DescTools::FisherZ(target_vec)) 
  }
  
  Par_cor_RS_list <- map(.x=seq(from=1,to=dim(cor_RS)[1],by=1),
                         .f=matrix_to_row)
  Par_cor_RS_df <- as.data.frame(do.call(rbind,Par_cor_RS_list))
  colnames(Par_cor_RS_df)<-names(cor_RS)
  
  Par_cor_RS_df <- Par_cor_RS_df %>%
    rename_with(~ sub("avg", "par", .x), .cols = contains("avg")) %>%
    mutate(SUBJECTKEY =data_split$SUBJECTKEY,
           EVENTNAME=data_split$EVENTNAME)
  data_return <- left_join(Par_cor_RS_df,
                           data_split, 
                           by = c("SUBJECTKEY","EVENTNAME"))%>% 
    drop_na()
  return(data_return)
}
rsmri_par_data <- partialcor(data_split = rsmri_data)
## Registered S3 method overwritten by 'DescTools':
##   method         from  
##   reorder.factor gplots

1.5 Join all the data sets

### the first row of NDA .txt files holds variable descriptions, so drop it
vision_idx <- vision_idx[-1,] %>% mutate(SUBJECTKEY = subjectkey) %>%
                                  mutate(EVENTNAME = eventname)

data_all <- plyr::join_all(list(smri_data,
                                MID_data,
                                rsmri_par_data,
                                Nback_data,
                                SST_data,DTI_data,
                                NBackBeh,
                                Pearson,
                                LittleMan,
                                NIH_TB,
                                MriandSite,
                                vision_idx), 
                           by=c('SUBJECTKEY','EVENTNAME'), type='full')
### full join all the data sets to keep all the participants after QC; sMRI has the largest number of participants
### only two participants are missing from the sMRI data set but present in other modalities
### change integer response variables into double for later scaling and IQR
data_all$PEA_RAVLT_LD_TRIAL_VII_TC <- as.double(data_all$PEA_RAVLT_LD_TRIAL_VII_TC)
data_all$PEA_WISCV_TRS <- as.double(data_all$PEA_WISCV_TRS)

1.5.1 Filter out participants who have problems with vision

This removes 54 subjects:

data_removed <- data_all %>% filter(snellen_va_y == 0 | snellen_va_y == 1 | vis_flg == 2)

removed_subj <- data_removed$SUBJECTKEY
data_all <- data_all %>% filter(!SUBJECTKEY %in% removed_subj)

length(removed_subj)
## [1] 54

1.6 Set up variable names

mri_vec <- c("Nback","SST","MID","rsmri","smri","DTI")

Resp_Var <- c('TFMRI_NB_ALL_BEH_C2B_RATE',
              "NIHTBX_PICVOCAB_UNCORRECTED", 
              "NIHTBX_FLANKER_UNCORRECTED",
              "NIHTBX_PATTERN_UNCORRECTED",
              "NIHTBX_PICTURE_UNCORRECTED",
              "NIHTBX_READING_UNCORRECTED",
              "LMT_SCR_PERC_CORRECT",
              "PEA_RAVLT_LD_TRIAL_VII_TC",
             "TFMRI_SST_ALL_BEH_TOTAL_ISSRT" )

resp_var_plotting_long <- c("2-back working memory",
  "Picture vocabulary",
  "Flanker",
  "Pattern comparison processing speed",
  "Picture sequence memory",
  "Oral reading recognition",
  "Little man task correct percentage",
  "RAVLT long delay trial VII total correct",
  "Stop Signal Reaction Time (integration)"
)

resp_var_plotting_short <- c("2-back Work Mem",
                             "Pic Vocab","Flanker",
                             "Pattern Speed",
                             "Seq Memory",
                             "Reading Recog",
                             "Little Man",
                             "Audi Verbal",
                             "integration SSRT")

resp_var_plotting <- tibble("response" =Resp_Var, 
                            "longer_name"=resp_var_plotting_long,
                            "short_name"=resp_var_plotting_short)

subj_info <- c('SUBJECTKEY', 'MRI_INFO_DEVICESERIALNUMBER', 'SITE_ID_L')

resp_names <- data_all %>% select(all_of(Resp_Var))%>%
  names()%>%
  set_names()

new_shorter_names <- short_names 
new_shorter_names$roiShort[97] <- "R Subcentral"
new_shorter_names$roiShort <-  str_squish(string = new_shorter_names$roiShort)

1.7 Functions for data preparation

IQR (3 × IQR outlier removal) and ComBat (to adjust batch effects for sMRI, rs-fMRI and DTI):

# remove observations outside 3 IQR from the 25th/75th percentiles, then z-score
IQR_remove <- function(data_split){
  data_split %>%
  mutate_at(vars(-all_of(subj_info)), ~ ifelse(
    .x > quantile(.x, na.rm = TRUE)[4] + 3 * IQR(.x, na.rm = TRUE) |   # above Q3 + 3*IQR
    .x < quantile(.x, na.rm = TRUE)[2] - 3 * IQR(.x, na.rm = TRUE),    # below Q1 - 3*IQR
    NA, .x)) %>%
  drop_na() %>%
  mutate_if(is.double, ~ (.x - mean(.x)) / sd(.x))                     # z-score
}
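
A quick sanity check of the 3 IQR rule on a made-up vector (the one extreme value should be flagged):

set.seed(1)
x <- c(rnorm(20), 10)                              # 20 typical values plus one outlier
q <- quantile(x, na.rm = TRUE)
x[x > q[4] + 3 * IQR(x) | x < q[2] - 3 * IQR(x)]   # returns 10, the flagged value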

library("sva")
## Loading required package: mgcv
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
## 
##     collapse
## This is mgcv 1.8-39. For overview type 'help("mgcv-package")'.
## Loading required package: genefilter
## 
## Attaching package: 'genefilter'
## The following objects are masked from 'package:yardstick':
## 
##     sens, spec
## The following object is masked from 'package:readr':
## 
##     spec
## Loading required package: BiocParallel
batch_adjust <- function(data_fold, resp, x, y){
  ols_data <- select(data_fold, resp, starts_with(x), starts_with(y))
  ols_matrix <- as.matrix(t(ols_data))        # ComBat expects features x samples
  dimnames(ols_matrix) <- NULL
  data_fold$SITE_ID_L <- as.factor(data_fold$SITE_ID_L)  ## must be a factor before ComBat
  data_fold$SITE_ID_L <- droplevels(data_fold$SITE_ID_L) ## drop empty factor levels
  ols_matrix_com <- ComBat(dat = ols_matrix, batch = data_fold$SITE_ID_L)
  ols_data_com <- data.frame(t(ols_matrix_com))
  colnames(ols_data_com) <- colnames(ols_data)
  return(ols_data_com)
}

# if only one type of data is adjusted (e.g., DTI only requires FA, whereas sMRI has two prefixes, cortical and subcortical, and rs-fMRI has within- and between-network)

batch_adjustOne <- function(data_fold, resp, x){
  ols_data <- select(data_fold, resp, starts_with(x))
  ols_matrix <- as.matrix(t(ols_data))        # ComBat expects features x samples
  dimnames(ols_matrix) <- NULL
  data_fold$SITE_ID_L <- as.factor(data_fold$SITE_ID_L)  ## must be a factor before ComBat
  data_fold$SITE_ID_L <- droplevels(data_fold$SITE_ID_L) ## drop empty factor levels
  ols_matrix_com <- ComBat(dat = ols_matrix, batch = data_fold$SITE_ID_L)
  ols_data_com <- data.frame(t(ols_matrix_com))
  colnames(ols_data_com) <- colnames(ols_data)
  return(ols_data_com)
}

1.8 Extract desired contrasts

## list the contrasts available for MID and SST

contrast_vec_MID <- MID_data %>% 
  select(.,ends_with("_ROI_Left.Cerebellum.Cortex")) %>%
  colnames() %>%
  str_remove_all(.,"_ROI_Left.Cerebellum.Cortex")

contrast_vec_SST <- SST_data %>% 
  select(.,ends_with("_ROI_Left.Cerebellum.Cortex")) %>%
  colnames() %>%
  str_remove_all(.,"_ROI_Left.Cerebellum.Cortex")

## extract the contrasts for analysis
data_analysis <- data_all %>% 
  select(starts_with("X2backvs0back_ROI_"),
         starts_with("par"),
         starts_with("Within"),
         all_of(Resp_Var),
         all_of(subj_info),
         starts_with("antiRewVsNeu" ),
         starts_with("anystopvscorrectgo" ),
         starts_with("ASEG"),
         starts_with("Dest_Thick")) 

1.9 Look at the distribution of the tasks

Note that the 2nd time point does not have “NIHTBX_LIST_UNCORRECTED”, “NIHTBX_CARDSORT_UNCORRECTED” or “PEA_WISCV_TRS”.
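
A quick way to confirm this, using the objects above (a hedged sketch; the counts should be at or near zero if these measures were not collected at follow-up):

data_all %>% filter(EVENTNAME == "2_year_follow_up_y_arm_1") %>%
  summarise(across(c(NIHTBX_LIST_UNCORRECTED, NIHTBX_CARDSORT_UNCORRECTED, PEA_WISCV_TRS),
                   ~ sum(!is.na(.x))))            # non-missing counts at follow-up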

plotRes <- Resp_Var %>% map(., ~ggplot(data = data_all,
                    mapping = aes_string(x = data_all[[.]], color = "EVENTNAME")) +
                    geom_density() +
                    labs(x = resp_var_plotting$short_name[which(resp_var_plotting$response == .)]) +
                    labs(y = "") +
                    scale_color_discrete(name = "",
                           breaks = c("2_year_follow_up_y_arm_1", "baseline_year_1_arm_1"),
                           labels = c("Follow Up", "Baseline")) +
                    theme_classic())

ggpubr::ggarrange(plotlist=plotRes, common.legend = TRUE)
## Warning: Removed 3040 rows containing non-finite values (stat_density).
## Removed 3040 rows containing non-finite values (stat_density).
## Warning: Removed 428 rows containing non-finite values (stat_density).
## Warning: Removed 320 rows containing non-finite values (stat_density).
## Warning: Removed 371 rows containing non-finite values (stat_density).
## Warning: Removed 343 rows containing non-finite values (stat_density).
## Warning: Removed 455 rows containing non-finite values (stat_density).
## Warning: Removed 410 rows containing non-finite values (stat_density).
## Warning: Removed 474 rows containing non-finite values (stat_density).
## Warning: Removed 6659 rows containing non-finite values (stat_density).

2 Modeling for generalisation across time for each cognitive task

2.1 Set up baseline and follow-up participants

First we removed sites that have fewer than 100 participants (in fact, none has fewer than 100).

The data structure of the analysis comprises four parts:

  1. training set (1st-layer training set): predicting cognition from each modality via the elastic net,

  2. validation set (2nd-layer training set): combining predicted values from all modalities via the random forest,

  3. test set of the baseline period,

  4. test set of the second-year follow-up.

To protect against data leakage, subjects who appear at both time points are assigned (by subject key) to the baseline and follow-up test sets, so that none of them enters training.

site_count <- data_analysis %>% count(SITE_ID_L)

baseline_data <- data_all %>% filter(EVENTNAME == "baseline_year_1_arm_1")
followup_data <- data_all %>% filter(EVENTNAME == "2_year_follow_up_y_arm_1")

## all sites have more than 100!
baseline_analysis <- baseline_data %>% filter(!SITE_ID_L %in% site_count$SITE_ID_L[which(site_count$n < 100)])
followup_analysis <- followup_data %>% filter(!SITE_ID_L %in% site_count$SITE_ID_L[which(site_count$n < 100)])

subj_diff <- setdiff(baseline_analysis$SUBJECTKEY,followup_analysis$SUBJECTKEY)
baseline_trainvalid <- baseline_analysis %>% subset(.,SUBJECTKEY %in% subj_diff)
baseline_test <- baseline_analysis %>% subset(.,SUBJECTKEY %in% followup_analysis$SUBJECTKEY)

set.seed(123456)
### divide the training dataset into two
### one for enet tuning and one for random forest tuning
baseline_two_fold <-vfold_cv(data =baseline_trainvalid, v=2,repeats = 1)

# training set (1st layer training set): predicting cognition from each modality via the elastic net,  
dim(analysis(baseline_two_fold$splits[[1]]))[1]
## [1] 3041
# validation set (2nd layer training set): combining predicted values from all modalities via the random forest,  
dim(assessment(baseline_two_fold$splits[[1]]))[1]
## [1] 3042
# test set of the baseline period,   
dim(baseline_test)[1]
## [1] 5622
# test set of the second year follow up
dim(followup_analysis)[1]
## [1] 5656

2.2 Elastic net parameter tuning setup

2.2.1 Set up the data and adjust for batch effects

Here we used ComBat, an empirical Bayes method, to adjust for batch effects (separately for the training and test sets). Because the N-back, SST and MID task-based fMRI measures are contrast data, their batch effects are negligible; accordingly, we did not apply ComBat to the task-based fMRI and only applied it to the resting-state fMRI, structural MRI and DTI (https://www.biorxiv.org/content/10.1101/309260v1.full.pdf).
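
The core call inside batch_adjust() above is sva::ComBat(), which expects a features × samples matrix and a batch factor. A minimal sketch on simulated data (not ABCD; the additive site effect is injected purely for illustration):

library("sva")
set.seed(123)
toy <- matrix(rnorm(200), nrow = 10)                   # 10 features x 20 samples
site <- factor(rep(c("siteA", "siteB"), each = 10))    # hypothetical batch labels
toy[, site == "siteB"] <- toy[, site == "siteB"] + 1   # inject an additive site shift
toy_adj <- ComBat(dat = toy, batch = site)             # empirical Bayes batch adjustment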

# for models that require no Combat
enet_processing_nocom <- 
  function(resp_var,
           split_train = analysis(baseline_two_fold$splits[[1]]),
           split_valid = assessment(baseline_two_fold$splits[[1]]),
           split_test = baseline_test,followup_test,contrast_name)
    {
              data_train <- split_train %>% 
                select(all_of(resp_var),
                       all_of(subj_info),
                       starts_with(contrast_name)) %>%
                drop_na() %>% 
                IQR_remove()
              
              data_valid <- split_valid %>% 
                select(all_of(resp_var),
                       all_of(subj_info),
                       starts_with(contrast_name)) %>% 
                drop_na() %>% 
                IQR_remove()
              
              baseline_test_set <- split_test %>% 
                select(all_of(resp_var),
                       all_of(subj_info),
                       starts_with(contrast_name)) %>% 
                drop_na() %>% 
                IQR_remove()
              
              followup_test_set <- followup_test %>% 
                select(all_of(resp_var),
                       all_of(subj_info),
                       starts_with(contrast_name)) %>%
                drop_na() %>% 
                IQR_remove()
              
              ols_data_train <- select(data_train, resp_var, starts_with(contrast_name))
              ols_data_valid <- select(data_valid,resp_var,starts_with(contrast_name))
              ols_test_baseline <- select(baseline_test_set,resp_var,starts_with(contrast_name))
              ols_test_followup <- select(followup_test_set,resp_var,starts_with(contrast_name))
    return(list(ols_data_train=ols_data_train,
              ols_data_valid=ols_data_valid,
              ols_test_baseline=ols_test_baseline,
              ols_test_followup=ols_test_followup,
              data_valid_id=tibble(SUBJECTKEY=data_valid$SUBJECTKEY),
              baseline_test_id=tibble(SUBJECTKEY=baseline_test_set$SUBJECTKEY),
              followup_test_id=tibble(SUBJECTKEY=followup_test_set$SUBJECTKEY),
              data_train_id=tibble(SUBJECTKEY=data_train$SUBJECTKEY)))
  }

# for models that require Combat
enet_processing_com <- 
  function(resp_var,
           split_train = analysis( baseline_two_fold$splits[[1]]),
           split_valid = assessment(baseline_two_fold$splits[[1]]),
           split_test=baseline_test,followup_test, measure_one,measure_two)
    {
    data_train <- split_train %>%
      select(all_of(resp_var),
             all_of(subj_info),
             starts_with(measure_one),
             starts_with(measure_two)) %>%
      drop_na() %>% 
      IQR_remove()
    
    data_valid <- split_valid %>%
      select(all_of(resp_var),
             all_of(subj_info),
             starts_with(measure_one),
             starts_with(measure_two)) %>% 
      drop_na()%>% 
      IQR_remove()
    
    baseline_test_set <- split_test%>%
      select(all_of(resp_var),
             all_of(subj_info),
             starts_with(measure_one),
             starts_with(measure_two))%>%
      drop_na()%>% 
      IQR_remove()
    
    followup_test_set <- followup_test%>%
      select(all_of(resp_var),
             all_of(subj_info),
             starts_with(measure_one),
             starts_with(measure_two))%>%
      drop_na()%>% 
      IQR_remove()
    
    ols_data_train <- 
      batch_adjust(data_fold = data_train,
                   resp = resp_var, 
                   x=measure_one, 
                   y=measure_two)
    
    ols_data_valid <- 
      batch_adjust(data_fold = data_valid,
                   resp = resp_var, 
                   x=measure_one, 
                   y=measure_two)
    
    ols_test_baseline <- 
      batch_adjust(data_fold = baseline_test_set,
                   resp = resp_var, 
                   x=measure_one, 
                   y=measure_two)
    
    ols_test_followup <- 
      batch_adjust(data_fold = followup_test_set,
                   resp = resp_var, 
                   x=measure_one, 
                   y=measure_two)
    
  return(list(ols_data_train=ols_data_train,
              ols_data_valid=ols_data_valid,
              ols_test_baseline=ols_test_baseline,
              ols_test_followup=ols_test_followup,
              data_valid_id=tibble(SUBJECTKEY=data_valid$SUBJECTKEY),
              baseline_test_id=tibble(SUBJECTKEY=baseline_test_set$SUBJECTKEY),
              followup_test_id=tibble(SUBJECTKEY=followup_test_set$SUBJECTKEY),
              data_train_id=tibble(SUBJECTKEY=data_train$SUBJECTKEY)))
  }

# for models that require ComBat on a single type of data; so far used only for DTI, because DTI has a single prefix ("FA_")
enet_processing_comOne <- 
  function(resp_var,
           split_train = analysis( baseline_two_fold$splits[[1]]),
           split_valid = assessment(baseline_two_fold$splits[[1]]),
           split_test=baseline_test,followup_test, measure_one)
    {
    data_train <- split_train %>%
      select(all_of(resp_var),
             all_of(subj_info),
             starts_with(measure_one)) %>%
      drop_na() %>% 
      IQR_remove()
    
    data_valid <- split_valid %>%
      select(all_of(resp_var),
             all_of(subj_info),
             starts_with(measure_one)) %>% 
      drop_na()%>% 
      IQR_remove()
    
    baseline_test_set <- split_test%>%
      select(all_of(resp_var),
             all_of(subj_info),
             starts_with(measure_one))%>%
      drop_na()%>% 
      IQR_remove()
    
    followup_test_set <- followup_test%>%
      select(all_of(resp_var),
             all_of(subj_info),
             starts_with(measure_one))%>%
      drop_na()%>% 
      IQR_remove()
    
    ols_data_train <- 
      batch_adjustOne(data_fold = data_train,
                   resp = resp_var, 
                   x=measure_one)
    
    ols_data_valid <- 
      batch_adjustOne(data_fold = data_valid,
                   resp = resp_var, 
                   x=measure_one)
    
    ols_test_baseline <- 
      batch_adjustOne(data_fold = baseline_test_set,
                   resp = resp_var, 
                   x=measure_one)
    
    ols_test_followup <- 
      batch_adjustOne(data_fold = followup_test_set,
                   resp = resp_var, 
                   x=measure_one)
    
  return(list(ols_data_train=ols_data_train,
              ols_data_valid=ols_data_valid,
              ols_test_baseline=ols_test_baseline,
              ols_test_followup=ols_test_followup,
              data_valid_id=tibble(SUBJECTKEY=data_valid$SUBJECTKEY),
              baseline_test_id=tibble(SUBJECTKEY=baseline_test_set$SUBJECTKEY),
              followup_test_id=tibble(SUBJECTKEY=followup_test_set$SUBJECTKEY),
              data_train_id=tibble(SUBJECTKEY=data_train$SUBJECTKEY)))
   }

2.2.2 A function for elastic net hyperparameter tuning and performance evaluation

This function fits the elastic net models and applies them to the 2nd-layer (validation) set and to the test sets:

enet_tuning <- function(resp_var, data_list){
  ## recipe: train and test data come from the ols_* objects, so the variables are the same
  norm_recipe <- recipe(as.formula(paste0(resp_var, "~ .")),
                        data = data_list[["ols_data_train"]]) %>%
    step_dummy(all_nominal()) %>%
    step_center(all_predictors()) %>%
    step_scale(all_predictors()) %>%
    # estimate the means and standard deviations
    prep(training = data_list[["ols_data_train"]], retain = TRUE)
  ## model
  glmn_mod <- linear_reg(penalty = tune(), mixture = tune()) %>%
    set_engine("glmnet")
set.seed(123456)
# 10-fold CV split within the training data
all_rs <- rsample::vfold_cv(data=data_list[["ols_data_train"]],
                            v = 10, repeats = 1)
glmn_wfl <- workflow() %>%
  add_recipe(norm_recipe) %>%
  add_model(glmn_mod)
## penalty is lambda, mixture is alpha
## log10 transformed to get lambda between 10^-10 to 10^1
## this expands the default values (10^-10 to 10^0) from tidymodels https://github.com/tidymodels/dials/issues/140
glmn_set <- dials::parameters(penalty(range = c(-10,1), trans = log10_trans()),mixture())
## 200 levels of lambda and 11 levels of alpha (0, .1 to 1) 
glmn_grid <- grid_regular(glmn_set, levels = c(200,11))
ctrl <- control_grid(save_pred = TRUE, verbose = TRUE)
glmn_tune <- tune_grid(glmn_wfl,
            resamples = all_rs,
            grid = glmn_grid,
            metrics = metric_set(mae),
            control = ctrl)

##select the best tuned lambda and alpha
best_glmn <- select_best(glmn_tune, metric = "mae")

## refit on the training data with the best-tuned parameters
glmn_wfl_final <- 
  glmn_wfl %>%
  finalize_workflow(best_glmn) %>%
  parsnip::fit(data = data_list[["ols_data_train"]]) ## parsnip:: prefix needed: fit() is masked by modeltools (loaded via party)

### this is the prediction for random forest tuning
pred_glmn <- predict(glmn_wfl_final, new_data =data_list[["ols_data_valid"]])

###this is the prediction for test set
pred_test_baseline <- predict(glmn_wfl_final, new_data =data_list[["ols_test_baseline"]])
pred_test_followup <- predict(glmn_wfl_final, new_data =data_list[["ols_test_followup"]])

##compute the correlation between predicted and observed values in the validation and test sets
performance_glmn <- cor(pred_glmn$.pred,data_list[["ols_data_valid"]][[resp_var]])
performance_baseline <- cor(pred_test_baseline$.pred,data_list[["ols_test_baseline"]][[resp_var]])
performance_followup <- cor(pred_test_followup$.pred,data_list[["ols_test_followup"]][[resp_var]])
best_glmn <- best_glmn %>% 
  mutate(performance=performance_glmn,
         baseline_performance = performance_baseline,
         followup_performance=performance_followup)

return(list("param_tune"=best_glmn, 
            "predicted"=tibble(SUBJECTKEY=data_list[["data_valid_id"]]$SUBJECTKEY, 
                               "pred"=pred_glmn$.pred),
            "predicted_baseline"=tibble(SUBJECTKEY=data_list[["baseline_test_id"]]$SUBJECTKEY, 
                                        "pred"=pred_test_baseline$.pred), 
            "predicted_followup"=tibble(SUBJECTKEY=data_list[["followup_test_id"]]$SUBJECTKEY, 
                                        "pred"=pred_test_followup$.pred),
            "obs_train"=length(data_list[["data_train_id"]]$SUBJECTKEY),
            "obs_valid"=length(data_list[["data_valid_id"]]$SUBJECTKEY)))
}

2.3 Model fitting for each cognitive task

2.3.1 Actual data prepping for elastic net for each cognitive task

Elastic net tuning results are saved together with the random forest results.

Nback_enet_data <- resp_names %>% 
  map(.,~enet_processing_nocom(resp_var = .,contrast_name = "X2backvs0back_ROI_",followup_test = followup_analysis))

##change this code before redoing the random forest
##the code has been changed but not re-run, except for the first one
SST_enet_data <- resp_names %>% 
  map(.,~enet_processing_nocom(resp_var = .,
                             followup_test = followup_analysis,
                             contrast_name = "anystopvscorrectgo_ROI_"))

MID_enet_data <- resp_names %>% 
  map(.,~enet_processing_nocom(resp_var = .,
                               followup_test = followup_analysis,
                               contrast_name = "antiRewVsNeu_ROI_"))

DTI_enet_data <- resp_names %>% 
  map(.,~enet_processing_comOne(resp_var = .,
                                followup_test = followup_analysis,
                                measure_one = "FA_"))

smri_enet_data <- resp_names %>% 
  map(.,~enet_processing_com(resp_var = .,
                             followup_test = followup_analysis,
                             measure_one = "ASEG_",
                             measure_two = "Dest_Thick_"))

rsmri_enet_data <- resp_names %>% 
  map(.,~enet_processing_com(resp_var = .,
                             followup_test = followup_analysis,
                             measure_one = "par",
                             measure_two = "Within"))

enet_data_all <- list("Nback"=Nback_enet_data,
                      "SST"=SST_enet_data,
                      "MID"=MID_enet_data,
                      "rsmri"=rsmri_enet_data,
                      "smri"=smri_enet_data,
                      "DTI"=DTI_enet_data)

2.3.2 Actual parameter tuning for elastic net for each cognitive task

##keep from using all the cores
##to keep your computer responsive
no_cores <- availableCores() - 4
plan(multisession,workers= no_cores)

Nback_results <- resp_names %>% 
  future_map(.,~enet_tuning(resp_var =.,
                            data_list = Nback_enet_data[[.]]),
             .options = furrr_options(seed=TRUE))

##change this code before redoing the random forest
##the code has been changed but not re-run, except for the first one
SST_results <- resp_names %>% 
  future_map(.,~enet_tuning(resp_var =.,
                            data_list = SST_enet_data[[.]]),
             .options = furrr_options(seed=TRUE))

MID_results <- resp_names %>% 
  future_map(.,~enet_tuning(resp_var =.,
                            data_list = MID_enet_data[[.]]),
             .options = furrr_options(seed=TRUE))

DTI_results <- resp_names %>% 
  future_map(.,~enet_tuning(resp_var =.,
                            data_list = DTI_enet_data[[.]]),
             .options = furrr_options(seed=TRUE))

smri_results <- resp_names %>% 
  future_map(.,~enet_tuning(resp_var =.,
                            data_list = smri_enet_data[[.]]),
             .options = furrr_options(seed=TRUE))

rsmri_results <- resp_names %>% 
  future_map(.,~enet_tuning(resp_var =.,
                            data_list = rsmri_enet_data[[.]]),
             .options = furrr_options(seed=TRUE))

enet_results_all <- list("Nback_results"=Nback_results,
                         "SST_results"=SST_results,
                         "MID_results"=MID_results,
                         "rsmri_results"=rsmri_results,
                         "smri_results"=smri_results,
                         "DTI_results"=DTI_results)

2.3.3 Getting the numbers of participants in all the data splits

modality_vec <- c("Nback","SST","MID","rsmri","smri","DTI")
modality_results <- paste0(modality_vec,"_results")

datasplits_number <- function(data_list, resp_vec, resp_plotting){
  number_datasplits <- modality_results %>% map(., function(modality_resp = .){
    data_input <- data_list[[modality_resp]]
    one_modality <- resp_vec %>% map(., function(resp = .){
      one_allset <- tibble(enet_train = data_input[[resp]][["obs_train"]],
                           stack_train = data_input[[resp]][["obs_valid"]],
                           baseline = dim(data_input[[resp]][["predicted_baseline"]])[1],
                           followup = dim(data_input[[resp]][["predicted_followup"]])[1],
                           response = resp)
      return(one_allset)
    })
  })

  names(number_datasplits) <- modality_results

  number_datasplits <- number_datasplits %>% map(., ~do.call(rbind, .))

  number_datasplits <- number_datasplits %>% map(., ~left_join(., resp_plotting, by = "response"))

  modality_results %>% map(., ~mutate(number_datasplits[[.]],
                                      modality = str_remove_all(., "_results")) %>%
                             select(modality,
                                    enet_train,
                                    stack_train,
                                    baseline,
                                    followup,
                                    short_name) %>%
    kableExtra::kbl(caption = "Sample size of all the data sets") %>%
    kableExtra::kable_classic(full_width = F, html_font = "Cambria") # needs results='asis' in the chunk header to knit
    )
}

datasplits_number(data_list = enet_results_all,resp_vec = resp_names,resp_plotting = resp_var_plotting)
[[1]]
Sample size of all the data sets
modality enet_train stack_train baseline followup short_name
Nback 1417 1385 2734 3183 2-back Work Mem
Nback 1397 1364 2719 3078 Pic Vocab
Nback 1398 1363 2709 3131 Flanker
Nback 1397 1361 2708 3118 Pattern Speed
Nback 1397 1364 2717 3129 Seq Memory
Nback 1392 1364 2712 3073 Reading Recog
Nback 1395 1361 2642 3175 Little Man
Nback 1392 1351 2682 3123 Audi Verbal
Nback 1162 1148 2285 2738 integration SSRT
[[2]]
Sample size of all the data sets
modality enet_train stack_train baseline followup short_name
SST 1403 1428 2755 3001 2-back Work Mem
SST 1447 1458 2854 2987 Pic Vocab
SST 1447 1452 2848 3039 Flanker
SST 1445 1452 2848 3028 Pattern Speed
SST 1447 1456 2850 3040 Seq Memory
SST 1439 1458 2846 2983 Reading Recog
SST 1440 1452 2772 3085 Little Man
SST 1444 1446 2819 3022 Audi Verbal
SST 1469 1480 2872 3088 integration SSRT
[[3]]
Sample size of all the data sets
modality enet_train stack_train baseline followup short_name
MID 1599 1594 3066 3221 2-back Work Mem
MID 1647 1635 3213 3222 Pic Vocab
MID 1646 1628 3210 3281 Flanker
MID 1647 1630 3206 3270 Pattern Speed
MID 1651 1635 3211 3279 Seq Memory
MID 1643 1634 3205 3215 Reading Recog
MID 1641 1631 3132 3326 Little Man
MID 1639 1621 3175 3261 Audi Verbal
MID 1295 1323 2579 2771 integration SSRT
[[4]]
Sample size of all the data sets
modality enet_train stack_train baseline followup short_name
rsmri 1871 1900 3556 4051 2-back Work Mem
rsmri 2041 2056 3862 4270 Pic Vocab
rsmri 2037 2045 3858 4350 Flanker
rsmri 2039 2052 3853 4331 Pattern Speed
rsmri 2043 2054 3860 4347 Seq Memory
rsmri 2041 2054 3846 4258 Reading Recog
rsmri 2034 2054 3756 4417 Little Man
rsmri 2034 2041 3823 4327 Audi Verbal
rsmri 1503 1535 2970 3438 integration SSRT
[[5]]
Sample size of all the data sets
modality enet_train stack_train baseline followup short_name
smri 2133 2178 4028 4380 2-back Work Mem
smri 2467 2484 4622 4724 Pic Vocab
smri 2465 2484 4619 4800 Flanker
smri 2464 2478 4611 4785 Pattern Speed
smri 2467 2485 4619 4805 Seq Memory
smri 2455 2482 4608 4712 Reading Recog
smri 2464 2488 4498 4881 Little Man
smri 2450 2472 4571 4775 Audi Verbal
smri 1661 1706 3292 3624 integration SSRT
[[6]]
Sample size of all the data sets
modality enet_train stack_train baseline followup short_name
DTI 2104 2111 3742 4117 2-back Work Mem
DTI 2324 2312 4159 4369 Pic Vocab
DTI 2321 2313 4154 4439 Flanker
DTI 2322 2308 4151 4428 Pattern Speed
DTI 2326 2312 4155 4439 Seq Memory
DTI 2324 2312 4143 4357 Reading Recog
DTI 2317 2311 4054 4510 Little Man
DTI 2320 2301 4126 4412 Audi Verbal
DTI 1637 1650 3051 3424 integration SSRT

2.3.4 Fit the tuned enet model with eNetXplorer for each cognitive task and plot the results

eNetXplorer allows us to use permutation tests to assess the significance of each feature.

### NOT RUN by default, takes long time

enet_results_xplorer <- enet_results_all
names(enet_results_xplorer) <- modality_vec
fitting_enetxplorer <- function(resp_vec , modality_var,data_input,tuning_input){
  data_split=data_input[[modality_var]]
enet_train <- resp_vec %>% map(., ~select(data_split[[.]][["ols_data_train"]],-all_of(.)))  
enet_test <- resp_vec %>% map(., ~select(data_split[[.]][["ols_test_baseline"]],-all_of(.))) 
enet_resp_train <- resp_vec %>% map(., ~select(data_split[[.]][["ols_data_train"]],all_of(.))) 
enet_resp_test <- resp_vec %>% map(., ~select(data_split[[.]][["ols_test_baseline"]],all_of(.)))

doParallel::registerDoParallel(cores = 20)

fit_enet_all_IQR <-resp_vec %>% 
  future_map(.,~eNetXplorer(x = as.matrix(enet_train[[.]]) , 
                            y = as.vector(enet_resp_train[[.]][[.]]), 
                            alpha =tuning_input[[modality_var]][[.]][["param_tune"]][["mixture"]], 
                            n_fold = 10,
                            #nlambda.ext = 10, 
                            #nlambda = 10, #uses the default 100 levels with the data-driven range defined by glmnet  
                            scaled = TRUE, 
                            seed = 123456),
             .options = furrr_options(seed=TRUE)) 
return(fit_enet_all_IQR)
}
##keep from using all the cores
##to keep your computer responsive
no_cores <- availableCores() - 4
plan(multisession,workers= no_cores)

enet_tuning_fit <- modality_vec  %>% 
  map(.,~fitting_enetxplorer(resp_vec = resp_names,
                             modality_var = .,
                             data_input = enet_data_all,
                             tuning_input = enet_results_xplorer))

names(enet_tuning_fit)<- modality_vec

2.3.5 Results of eNetXplorer for each cognitive task

Here we show the hyperparameters from eNetXplorer.
Alpha is forced to be the same as in the tidymodels approach, and lambda, chosen via glmnet's adaptive range, is close to the tidymodels values; see more on this approach at
https://stats.stackexchange.com/questions/174897/choosing-the-range-and-grid-density-for-regularization-parameter-in-lasso. The accuracy is the out-of-bag correlation between predicted and observed values within the training set. The p-value is based on the permutation test.
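
The logic behind such a permutation p-value can be sketched generically as follows (an illustration only, not eNetXplorer's exact internal procedure):

set.seed(123)
y <- rnorm(100)                                    # toy observed scores
pred <- 0.3 * y + rnorm(100)                       # toy predicted scores
obs_r <- cor(pred, y)                              # observed accuracy
null_r <- replicate(2000, cor(pred, sample(y)))    # accuracy under shuffled outcomes
(sum(abs(null_r) >= abs(obs_r)) + 1) / (2000 + 1)  # permutation p-value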

enet_fit_table <- function(modality_input,fit_input, tune_input,resp_input, plotting_names ){
  fit_enet_all_IQR <- fit_input[[modality_input]]
  glmn_tuning_all_IQR <- tune_input[[modality_input]]
  lambdas_all <- vector("list", length = length(resp_input))
  names(lambdas_all)<- resp_input
  lambdas_all_best <- vector("list", length = length(resp_input))
  names(lambdas_all_best)<- resp_input
  summary_enet_all <- vector("list", length = length(resp_input))
  names(summary_enet_all) <- resp_input

for(i in 1:length(resp_input)){
  lambdas_all[[resp_input[i]]] <- fit_enet_all_IQR[[resp_input[i]]][["lambda_values"]]
  lambdas_all_best[[resp_input[i]]] <- fit_enet_all_IQR[[resp_input[i]]][["best_lambda"]]
  summary_enet_all[[resp_input[i]]]<- as_tibble(summary(fit_enet_all_IQR[[resp_input[i]]])[[2]]) %>% slice(1)
}

summary_enet_all_test <- summary_enet_all %>% bind_rows() %>% 
  mutate(response = plotting_names$short_name)

summary_vec <- colnames(summary_enet_all_test)[-5]

## round the numeric columns to 4 digits (column by column), then reassemble
summary_enet_all_test <- summary_vec %>% map(.,function(var_input=.){
  summary_enet_all_test[[paste0(var_input)]] <- round(summary_enet_all_test[[paste0(var_input)]],4)
})
names(summary_enet_all_test)<- summary_vec

summary_enet_all_test <- summary_enet_all_test %>% bind_rows()%>% 
  mutate(response = plotting_names$short_name)%>%
  rename(.,Alpha =  alpha, `Best-tune lambda` = lambda.max, 
         `Accuracy (Pearson r)` = QF.est, 
         `P-value` = model.vs.null.pval) %>%
  #knitr::kable(caption = paste0("EnetXplorer fitting of ", modality_input))
   kableExtra::kbl(caption = paste0("EnetXplorer fitting of ", modality_input)) %>%
   kableExtra::kable_classic(full_width = F, html_font = "Cambria") 
# the knitting problem was solved by adding the results='asis' chunk option
}

 modality_vec %>% map(.,~enet_fit_table(modality_input=.,
                                        fit_input = enet_tuning_fit, 
                                        tune_input = enet_results_xplorer,
                                        resp_input = resp_names,
                                        plotting_names = resp_var_plotting))
[[1]]
EnetXplorer fitting of Nback
Alpha Best-tune lambda Accuracy (Pearson r) P-value response
0.0 0.7398 0.4706 0.0004 2-back Work Mem
0.0 0.8318 0.3464 0.0004 Pic Vocab
0.0 1.5030 0.1769 0.0004 Flanker
0.3 0.1277 0.1183 0.0004 Pattern Speed
0.0 4.0530 0.1895 0.0004 Seq Memory
0.0 2.4728 0.2959 0.0004 Reading Recog
0.1 0.1947 0.2426 0.0004 Little Man
0.1 0.2181 0.1645 0.0004 Audi Verbal
0.0 0.8539 0.1153 0.0012 integration SSRT
[[2]]
EnetXplorer fitting of SST
Alpha Best-tune lambda Accuracy (Pearson r) P-value response
0.5 0.0254 0.1282 0.0008 2-back Work Mem
0.2 0.0484 0.1658 0.0004 Pic Vocab
0.5 0.0476 0.1196 0.0012 Flanker
0.0 0.0593 0.0581 0.0496 Pattern Speed
0.1 0.0018 0.0052 0.4390 Seq Memory
0.0 0.0062 0.0741 0.0216 Reading Recog
0.8 0.0314 0.0936 0.0064 Little Man
0.9 0.0518 0.0549 0.0396 Audi Verbal
0.0 0.5711 0.3137 0.0004 integration SSRT
[[3]]
EnetXplorer fitting of MID
Alpha Best-tune lambda Accuracy (Pearson r) P-value response
0.0 0.6653 0.1379 0.0004 2-back Work Mem
0.0 0.8611 0.2202 0.0004 Pic Vocab
0.0 2.6097 0.0704 0.0112 Flanker
0.0 2.0368 0.0783 0.0028 Pattern Speed
0.0 2.8319 0.0815 0.0064 Seq Memory
0.1 0.1873 0.2257 0.0004 Reading Recog
0.1 0.1866 0.0916 0.0040 Little Man
0.0 1.7944 0.1472 0.0004 Audi Verbal
1.0 0.0233 0.0760 0.0224 integration SSRT
[[4]]
EnetXplorer fitting of rsmri
Alpha Best-tune lambda Accuracy (Pearson r) P-value response
0 0.0512 0.1677 0.0004 2-back Work Mem
0 0.5142 0.2210 0.0004 Pic Vocab
0 3.0027 0.0962 0.0004 Flanker
0 0.3233 0.0822 0.0032 Pattern Speed
0 1.1274 0.0997 0.0004 Seq Memory
0 0.0907 0.1959 0.0004 Reading Recog
1 0.0304 0.1415 0.0004 Little Man
0 3.9336 0.0995 0.0008 Audi Verbal
1 0.0351 0.0645 0.0328 integration SSRT
[[5]]
EnetXplorer fitting of smri
Alpha Best-tune lambda Accuracy (Pearson r) P-value response
0.0 4.8930 0.1455 0.0004 2-back Work Mem
0.2 0.0418 0.2112 0.0004 Pic Vocab
1.0 0.0328 0.0754 0.0060 Flanker
0.0 1.8544 0.0612 0.0100 Pattern Speed
0.0 2.7896 0.1288 0.0004 Seq Memory
0.0 0.9688 0.1869 0.0004 Reading Recog
0.0 2.9783 0.1447 0.0004 Little Man
0.1 0.2536 0.0920 0.0004 Audi Verbal
0.0 4.6013 0.0647 0.0144 integration SSRT
[[6]]
EnetXplorer fitting of DTI
Alpha Best-tune lambda Accuracy (Pearson r) P-value response
0.0 0.7204 0.1089 0.0004 2-back Work Mem
0.0 0.0129 0.2142 0.0004 Pic Vocab
0.2 0.0295 0.1105 0.0004 Flanker
0.0 0.3508 0.1073 0.0004 Pattern Speed
0.0 0.4257 0.1005 0.0004 Seq Memory
1.0 0.0019 0.1863 0.0004 Reading Recog
0.1 0.0544 0.1337 0.0004 Little Man
0.0 0.0939 0.0991 0.0004 Audi Verbal
0.0 1.5825 -0.0135 0.4242 integration SSRT

2.3.6 Function to extract output from eNetXplorer

extract_tibble <- function(elastic_mod, alpha_index) {
    ## feature-level weighted means/SDs of coefficients for target and permuted-null models
    variable <- elastic_mod$feature_coef_wmean[, alpha_index] %>% names()
    wmean <- elastic_mod$feature_coef_wmean[, alpha_index]
    wsd <- elastic_mod$feature_coef_wsd[, alpha_index]
    null_wmean <- elastic_mod$null_feature_coef_wmean[, alpha_index]
    null_wsd <- elastic_mod$null_feature_coef_wsd[, alpha_index]
    pvalue <- elastic_mod$feature_coef_model_vs_null_pval[, alpha_index]
    
    tib <- tibble(variable, wmean, wsd, null_wmean, null_wsd, pvalue)
   
    tib <- tib %>%
      gather(key = 'placeholder', value = 'value', wmean, wsd, null_wmean, null_wsd) %>%
      mutate(type = ifelse(str_detect(placeholder, 'null'), 'null', 'target'),
             placeholder = (str_remove(placeholder, 'null_'))) %>%
      mutate(type = factor(type, labels = c('Null', 'Target'))) %>%
      spread(placeholder, value)
    tib
}
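
For example, assuming the objects from sections 2.3.2 and 2.3.4 are in the session, the coefficient summary for the N-back model of 2-back working memory could be pulled with the call below (the "a0" index follows the paste0("a", mixture) convention used in the plotting functions, since this model's tuned alpha is 0):

extract_tibble(enet_tuning_fit[["Nback"]][["TFMRI_NB_ALL_BEH_C2B_RATE"]],
               alpha_index = "a0")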

2.3.7 Plot results from eNetXplorer with task-fMRI predicting each cognitive task

modality_type_vec <- c("Nback","SST","MID")
str_type_vec <- c("X2backvs0back_ROI_","anystopvscorrectgo_ROI_","antiRewVsNeu_ROI_")

enet_interval_plot <- 
  function(data_input,
           tuning_input,
           modality_input,
           str_input,
           resp_input,
           plotting_names=resp_var_plotting){
    
    fit_enet_all_IQR <- data_input[[modality_input]]
    names(tuning_input)<- modality_vec
    glmn_tuning_all_IQR <- tuning_input[[modality_input]]
    
    coefs_enet_all <- resp_input %>%  
      map(.,
          ~extract_tibble(
            fit_enet_all_IQR[[.]], 
            alpha_index = paste0("a",glmn_tuning_all_IQR[[.]][["param_tune"]][["mixture"]])))
    
    coefs_enet_all <- coefs_enet_all  %>% 
      map(.,~filter(.,pvalue < 0.05) %>%
            mutate(.,type = ifelse(type == 'Null', 
                                   'Null permuted models', 
                                   'Target models'),
                   roi = str_remove(variable, str_input))%>%
            left_join( .,new_shorter_names,by="roi"))
    
    roi_num_enet <- coefs_enet_all %>% map(.,~dim(.)[1])  
    
    max_roi_enet <- max(as.numeric(roi_num_enet))
    coefs_enet_test <- coefs_enet_all[[resp_input[1]]] %>% 
      group_by(type)
    
    coefs_enet_test <- coefs_enet_test%>% nest(-type)
    
    coefs_enet_test[[2]][[1]] <- 
      coefs_enet_test[[2]][[1]] %>% 
      mutate(direction1 = 
               ifelse(coefs_enet_test[[2]][[1]]$wmean >=
                        median(coefs_enet_test[[2]][[1]]$wmean)|roi_num_enet[[resp_input[1]]] <=
                        floor(max_roi_enet/2),"big","small"))
    
    coefs_enet_test[[2]][[2]] <- coefs_enet_test[[2]][[2]] %>% 
      mutate(direction1=coefs_enet_test[[2]][[1]]$direction1)

    coefs_enet_test <- coefs_enet_test %>% unnest()
    
    coefs_enet_all <- coefs_enet_all%>%map(.,~group_by(.,type)) 
    
    coefs_enet_all <- coefs_enet_all%>%map(.,~nest(.,-type)) 

for(i in 1:length(resp_input)){
  coefs_enet_all[[resp_input[i]]][["data"]][[2]]<-
    coefs_enet_all[[resp_input[i]]][["data"]][[2]] %>% 
    mutate(direction = ifelse(coefs_enet_all[[resp_input[i]]][["data"]][[2]]$wmean >= median(coefs_enet_all[[resp_input[i]]][["data"]][[2]]$wmean)|roi_num_enet[[resp_input[i]]] <= floor(max_roi_enet/2),"big","small"))

    coefs_enet_all[[resp_input[i]]][["data"]][[1]] <- 
      coefs_enet_all[[resp_input[i]]][["data"]][[1]] %>%
      mutate(direction=coefs_enet_all[[resp_input[i]]][["data"]][[2]]$direction)
}

coefs_enet_all <- coefs_enet_all %>%map(.,~unnest(.)) 

resp_input %>% map(.,~ggplot(coefs_enet_all[[.]], 
                             aes(x = fct_reorder(roiShort, wmean), 
                                 y = wmean, 
                                 ymax = wmean + 2 * wsd, 
                                 ymin = wmean - 2 * wsd,
                                 col = type)) +
  geom_pointrange(fatten = 0.5, key_glyph = 'point') +
  scale_y_continuous(labels = numform::ff_num(zero = 0, digits = 2)) + 
  scale_color_grey(start = 0.7, end = 0.5) +
  coord_flip() +
  guides(colour = guide_legend(override.aes = list(size = 2.5)))+
  labs(x = 'Explanatory Variables (Brain Regions)', 
       y = 'Averaged Coefficient Across Models (±2 SD)', 
       col = 'Model type',
       title = paste0(modality_input, " task-fMRI Predicting\n",
                      plotting_names$longer_name[[which(plotting_names$response==.)]])) +
  facet_wrap(~ direction, scales = 'free_y') +
  scale_color_manual(values = c("#56B4E9", "black"),labels = c("Permuted Null", "Target")) +     
  theme_bw() +  
#theme(axis.text.y = element_text(angle = 15)) +
theme(legend.title=element_blank()) +  
theme(legend.position = "top") + 
theme(
  axis.title.x = element_text(size = 15),
  axis.text.x = element_text(size = 12),
  axis.title.y = element_text(size = 15),
  axis.text.y = element_text(size = 12),
  legend.text = element_text(size = 15),
  plot.title = element_text(size=15)) +
 theme(
    strip.background = element_blank(),
    strip.text.x = element_blank()) +
 theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
)

  } 

map2(.x=modality_type_vec,
     .y=str_type_vec,
     ~enet_interval_plot(data_input=enet_tuning_fit,
                         tuning_input=enet_results_all,
                         resp_input = resp_names,
                         modality_input = .x,
                         str_input = .y,
                         plotting_names=resp_var_plotting))
## coefficient plots, one per response variable, for each modality:
## [[1]] Nback, [[2]] SST, [[3]] MID, each with elements $TFMRI_NB_ALL_BEH_C2B_RATE,
## $NIHTBX_PICVOCAB_UNCORRECTED, $NIHTBX_FLANKER_UNCORRECTED, $NIHTBX_PATTERN_UNCORRECTED,
## $NIHTBX_PICTURE_UNCORRECTED, $NIHTBX_READING_UNCORRECTED, $LMT_SCR_PERC_CORRECT,
## $PEA_RAVLT_LD_TRIAL_VII_TC, $TFMRI_SST_ALL_BEH_TOTAL_ISSRT (plots omitted here)

2.3.8 Plot results from eNetXplorer with sMRI predicting each cognitive task

enet_interval_plot_smri <- function(data_input,
                                    tuning_input,
                                    resp_input,
                                    plotting_names = resp_var_plotting){
    fit_enet_all_IQR <- data_input[["smri"]]
    names(tuning_input) <- modality_vec
    glmn_tuning_all_IQR <- tuning_input[["smri"]]
    
    coefs_enet_all <- resp_input %>%
      map(., ~extract_tibble(fit_enet_all_IQR[[.]],
                             alpha_index = paste0("a", glmn_tuning_all_IQR[[.]][["param_tune"]][["mixture"]])))
    ## sMRI variables carry two prefixes (subcortical and cortical); strip both to recover the ROI name
    roi_vec <- coefs_enet_all[[1]][["variable"]]
    roi_vec <- gsub("ASEG_", "", roi_vec)
    roi_vec <- gsub("Dest_Thick_", "", roi_vec)
    roi_frame <- tibble(variable = coefs_enet_all[[1]][["variable"]], roi = roi_vec)
    coefs_enet_all <- coefs_enet_all %>%
      map(., ~left_join(., roi_frame, by = "variable") %>%
            filter(., pvalue < 0.05) %>%
            mutate(., type = ifelse(type == 'Null', 'Null permuted models', 'Target models')) %>%
            left_join(., new_shorter_names, by = "roi"))
  
roi_num_enet <- coefs_enet_all %>% map(., ~dim(.)[1])

max_roi_enet <- max(as.numeric(roi_num_enet))
coefs_enet_test <- coefs_enet_all[[resp_input[1]]] %>%
  group_by(type)
coefs_enet_test <- coefs_enet_test %>% nest(-type)
coefs_enet_test[[2]][[1]] <- coefs_enet_test[[2]][[1]] %>%
  mutate(direction1 = ifelse(coefs_enet_test[[2]][[1]]$wmean >= median(coefs_enet_test[[2]][[1]]$wmean) |
                               roi_num_enet[[resp_input[1]]] <= floor(max_roi_enet/2),
                             "big", "small"))
coefs_enet_test[[2]][[2]] <- coefs_enet_test[[2]][[2]] %>%
  mutate(direction1 = coefs_enet_test[[2]][[1]]$direction1)
coefs_enet_test <- coefs_enet_test %>% unnest()

coefs_enet_all <- coefs_enet_all %>% map(., ~group_by(., type))
coefs_enet_all <- coefs_enet_all %>% map(., ~nest(., -type))
for(i in 1:length(resp_input)){
  coefs_enet_all[[resp_input[i]]][["data"]][[2]] <- coefs_enet_all[[resp_input[i]]][["data"]][[2]] %>%
    mutate(direction = ifelse(coefs_enet_all[[resp_input[i]]][["data"]][[2]]$wmean >=
                                median(coefs_enet_all[[resp_input[i]]][["data"]][[2]]$wmean) |
                                roi_num_enet[[resp_input[i]]] <= floor(max_roi_enet/2),
                              "big", "small"))
  coefs_enet_all[[resp_input[i]]][["data"]][[1]] <- coefs_enet_all[[resp_input[i]]][["data"]][[1]] %>%
    mutate(direction = coefs_enet_all[[resp_input[i]]][["data"]][[2]]$direction)
}

coefs_enet_all <- coefs_enet_all %>% map(., ~unnest(.))

resp_input %>% map(.,~ggplot(coefs_enet_all[[.]], aes(x = fct_reorder(roiShort, wmean), 
             y = wmean, 
             ymax = wmean + 2 * wsd, ymin = wmean - 2 * wsd,
             col = type)) +
  geom_pointrange(fatten = 0.5, key_glyph = 'point') +
  scale_y_continuous(labels = numform::ff_num(zero = 0, digits = 2)) + 
  scale_color_grey(start = 0.7, end = 0.5) +
  coord_flip() +
  guides(colour = guide_legend(override.aes = list(size = 2.5)))+
  labs(x = 'Explanatory Variables (Brain Regions)', 
       y = 'Averaged Coefficient Across Models (±2 SD)', 
       col = 'Model type',
       title = paste0("sMRI Predicting\n",plotting_names$longer_name[[which(plotting_names$response==.)]])) +
  facet_wrap(~ direction, scales = 'free_y') +
  scale_color_manual(values = c("#56B4E9", "black"),labels = c("Permuted Null", "Target")) +     
  theme_bw() +  
#theme(axis.text.y = element_text(angle = 15)) +
theme(legend.title=element_blank()) +  
theme(legend.position = "top") + 
theme(
  axis.title.x = element_text(size = 15),
  axis.text.x = element_text(size = 12),
  axis.title.y = element_text(size = 15),
  axis.text.y = element_text(size = 12),
  legend.text = element_text(size = 15),
  plot.title = element_text(size=15)) +
 theme(
    strip.background = element_blank(),
    strip.text.x = element_blank()) +
 theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
)

  } 



enet_interval_plot_smri(data_input=enet_tuning_fit,tuning_input=enet_results_all,resp_input = resp_names,plotting_names=resp_var_plotting)
## coefficient plots, one per response variable: $TFMRI_NB_ALL_BEH_C2B_RATE,
## $NIHTBX_PICVOCAB_UNCORRECTED, $NIHTBX_FLANKER_UNCORRECTED, $NIHTBX_PATTERN_UNCORRECTED,
## $NIHTBX_PICTURE_UNCORRECTED, $NIHTBX_READING_UNCORRECTED, $LMT_SCR_PERC_CORRECT,
## $PEA_RAVLT_LD_TRIAL_VII_TC, $TFMRI_SST_ALL_BEH_TOTAL_ISSRT (plots omitted here)

2.3.9 Plot results from eNetXplorer with rs-fMRI predicting each cognitive task

roi_names <- colnames(select(rsmri_par_data, starts_with("parAuditory"))) %>%
  str_remove_all(., "parAuditory")
roi_names_all <- c("Auditory",roi_names)
par_roi_names_all <- paste0("par", roi_names_all)


matrix_prepare <- function(data_list){
  ## build a 13 x 13 network-by-network coefficient matrix:
  ## "par<Net1><Net2>" names fill the between-network cell (symmetrically),
  ## "Within<Net>" names fill the corresponding diagonal cell
  heat_matrix <- matrix(rep(0, 13 * 13), nrow = 13)
  for(i in 1:dim(data_list)[1]){
    if(str_detect(data_list$roi[i], "^par")){
      coor_matrix_1 <- which(as.vector(par_roi_names_all %>% map(.,~str_detect(data_list$roi[i], paste0("\\A", .))) %>% do.call(rbind, .)))
      coor_matrix_2 <- which(as.vector(roi_names_all %>% map(.,~str_detect(data_list$roi[i], paste0(., "$"))) %>% do.call(rbind, .)))
      heat_matrix[coor_matrix_1, coor_matrix_2] <- heat_matrix[coor_matrix_2, coor_matrix_1] <- data_list$estimate[i]
    }
    if(str_detect(data_list$roi[i], "^Within")){
      coor_matrix <- which(as.vector(roi_names_all %>% map(.,~str_detect(data_list$roi[i], paste0("Within", .))) %>% do.call(rbind, .)))
      heat_matrix[coor_matrix, coor_matrix] <- data_list$estimate[i]
    }
  }
  ## keep the upper triangle only; the heatmap is symmetric
  heat_matrix[lower.tri(heat_matrix)] <- NA
  return(heat_matrix)
}
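
A minimal usage sketch of matrix_prepare() with illustrative values (it reads the roi_names_all and par_roi_names_all vectors defined above; the 0.12 and -0.30 estimates are hypothetical):

toy_rois <- tibble(
  roi = c(paste0("par", roi_names_all[1], roi_names_all[2]),  # between-network pair
          paste0("Within", roi_names_all[3])),                # within-network term
  estimate = c(0.12, -0.30))
heat_toy <- matrix_prepare(toy_rois)
heat_toy[1, 2]  # 0.12, the between-network cell (its mirror below the diagonal is NA)
heat_toy[3, 3]  # -0.30, the diagonal cell of the third network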


enet_interval_plot_rsmri <- function(data_input,tuning_input,resp_input,plotting_names=resp_var_plotting){
    fit_ridge_all_RS_par <- data_input[["rsmri"]] ## legacy "ridge" names; these are the tuned elastic-net (eNetXplorer) fits
    names(tuning_input)<- modality_vec
    glmn_tuning_all_RS_par <- tuning_input[["rsmri"]]
    
      coefs_ridge_all <- resp_input %>%  map(.,~extract_tibble(fit_ridge_all_RS_par[[.]], alpha_index = paste0("a",glmn_tuning_all_RS_par[[.]][["param_tune"]][["mixture"]])))
    coefs_ridge_all <- coefs_ridge_all  %>% map(.,~filter(.,pvalue < 0.05) %>%
  mutate(.,type = ifelse(type == 'Null', 'Null permuted models', 'Target models'),roi=variable,roi_short= str_remove_all(variable,"^par")))
roi_num_ridge <- coefs_ridge_all %>% map(.,~dim(.)[1])
max_roi_ridge <- max(as.numeric(roi_num_ridge))

coefs_ridge_test <- coefs_ridge_all[[resp_input[1]]] %>% group_by(type)
coefs_ridge_test <- coefs_ridge_test%>% nest(-type)
 coefs_ridge_test[[2]][[1]] <- coefs_ridge_test[[2]][[1]] %>% 
   mutate(direction1 = ifelse(wmean >= median(wmean) |
                                roi_num_ridge[[resp_input[1]]] <= floor(max_roi_ridge/2),
                              "big", "small"))
coefs_ridge_test[[2]][[2]] <- coefs_ridge_test[[2]][[2]] %>% mutate(direction1 = coefs_ridge_test[[2]][[1]]$direction1)
coefs_ridge_test <- coefs_ridge_test %>% unnest()

coefs_ridge_all <- coefs_ridge_all%>%map(.,~group_by(.,type)) 
coefs_ridge_all <- coefs_ridge_all%>%map(.,~nest(.,-type)) 
for(i in 1:length(resp_input)){
  coefs_ridge_all[[resp_input[i]]][["data"]][[2]] <- coefs_ridge_all[[resp_input[i]]][["data"]][[2]] %>% 
    mutate(direction = ifelse(wmean >= median(wmean) |
                                roi_num_ridge[[resp_input[i]]] <= floor(max_roi_ridge/2),
                              "big", "small"))
  coefs_ridge_all[[resp_input[i]]][["data"]][[1]] <- coefs_ridge_all[[resp_input[i]]][["data"]][[1]] %>% 
    mutate(direction = coefs_ridge_all[[resp_input[i]]][["data"]][[2]]$direction)
}
coefs_ridge_all <- coefs_ridge_all %>%map(.,~unnest(.))

matrix_ridge<- coefs_ridge_all %>% map(.,~filter(., type=="Target models")%>% mutate(.,estimate=wmean)%>%matrix_prepare(.))

for(i in 1:length(resp_input)){
  colnames(matrix_ridge[[resp_input[i]]]) <- roi_names_all
  rownames(matrix_ridge[[resp_input[i]]])<- roi_names_all
}
matrix_ridge_test <- matrix_ridge%>% map(.,~as.data.frame(.)%>%mutate(roi = roi_names_all)%>%reshape2::melt(.,id.vars="roi")) 

method_vec <- c("Enet")
matrix_list <- list(Enet = matrix_ridge_test)
matrix_list_name <- method_vec %>% map(.,function(method_used=.){
  matrix_list_name <- resp_input%>%map(.,~mutate(matrix_list[[method_used]][[.]], method=method_used))
  return(matrix_list_name)
})
 names(matrix_list_name)=method_vec
 legend_all <- method_vec %>% map(.,function(method_used=.){
    legend_all <- resp_input %>% map(.,~range(matrix_list_name[[method_used]][[.]]$value,na.rm=TRUE)) 
  return(legend_all)
 })
 names(legend_all)<- method_vec
legend_all_test <- t(as.matrix (as.data.frame(legend_all))) 
legend_test <- tibble("low"=legend_all_test[,1],"upp"=legend_all_test[,2],"resp"=resp_input, "method"=c(rep("Elastic Net",length(resp_input))))

legend_resp <- vector("list", length = length(resp_input))
names(legend_resp)<- resp_input
for(i in 1:length(resp_input)){
  legend_resp[[resp_input[i]]]<- tibble("upp"= max(legend_test$upp[which(legend_test$resp==resp_input[i])]),
                                               "low"=min(legend_test$low[which(legend_test$resp==resp_input[i])])) 
}

resp_input%>% map(.,~ggplot(data =matrix_ridge_test[[.]], aes(x=variable, y=roi, fill=value)) +
 geom_tile(color = "white")+
   labs(x=NULL,y=NULL,title=paste0("rs-fMRI Predicting\n", plotting_names$longer_name[[which(plotting_names$response==.)]]))+
 scale_fill_gradient2(low = muted("blue") , high =muted("red"), mid = "grey80", 
   midpoint = 0,na.value = "grey50", limit = c(legend_resp[[.]]$low, legend_resp[[.]]$upp), space = "Lab", 
   name="Parameter\nEstimation") +
  theme_minimal()+ 
 theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1),
      axis.text.y = element_text(angle = 45, vjust = 1, size = 12, hjust = 1),
     #axis.text.x = element_blank(),
     #axis.text.y = element_blank(),
    plot.title = element_text(size=17))+
 coord_fixed())

  } 

enet_interval_plot_rsmri(data_input=enet_tuning_fit,tuning_input=enet_results_all,resp_input = resp_names,plotting_names=resp_var_plotting)
## $TFMRI_NB_ALL_BEH_C2B_RATE

## 
## $NIHTBX_PICVOCAB_UNCORRECTED

## 
## $NIHTBX_FLANKER_UNCORRECTED

## 
## $NIHTBX_PATTERN_UNCORRECTED

## 
## $NIHTBX_PICTURE_UNCORRECTED

## 
## $NIHTBX_READING_UNCORRECTED

## 
## $LMT_SCR_PERC_CORRECT

## 
## $PEA_RAVLT_LD_TRIAL_VII_TC

## 
## $TFMRI_SST_ALL_BEH_TOTAL_ISSRT

2.3.10 Plot results from eNetXplorer with DTI predicting each cognitive task

enet_interval_plot_DTI <- function(data_input, tuning_input,resp_input,plotting_names=resp_var_plotting){
    fit_enet_all_IQR <- data_input[["DTI"]]
    names(tuning_input)<- modality_vec
    glmn_tuning_all_IQR <- tuning_input[["DTI"]]
    
      coefs_enet_all <- resp_input %>%  map(.,~extract_tibble(fit_enet_all_IQR[[.]], alpha_index = paste0("a",glmn_tuning_all_IQR[[.]][["param_tune"]][["mixture"]])))
    coefs_enet_all <- coefs_enet_all  %>% map(.,~filter(.,pvalue < 0.05) %>%
  mutate(.,type = ifelse(type == 'Null', 'Null permuted models', 'Target models'),
         roi = str_remove(variable, "FA_")))
roi_num_enet <- coefs_enet_all %>% map(.,~dim(.)[1])  

max_roi_enet <- max(as.numeric(roi_num_enet))
coefs_enet_test <- coefs_enet_all[[resp_input[1]]] %>% group_by(type)
coefs_enet_test <- coefs_enet_test%>% nest(-type)
 coefs_enet_test[[2]][[1]] <- coefs_enet_test[[2]][[1]] %>% 
   mutate(direction1 = ifelse(wmean >= median(wmean) |
                                roi_num_enet[[resp_input[1]]] <= floor(max_roi_enet/2),
                              "big", "small"))
coefs_enet_test[[2]][[2]] <- coefs_enet_test[[2]][[2]] %>% mutate(direction1 = coefs_enet_test[[2]][[1]]$direction1)
coefs_enet_test <- coefs_enet_test %>% unnest()

coefs_enet_all <- coefs_enet_all%>%map(.,~group_by(.,type)) 
coefs_enet_all <- coefs_enet_all%>%map(.,~nest(.,-type)) 
for(i in 1:length(resp_input)){
  coefs_enet_all[[resp_input[i]]][["data"]][[2]] <- coefs_enet_all[[resp_input[i]]][["data"]][[2]] %>% 
    mutate(direction = ifelse(wmean >= median(wmean) |
                                roi_num_enet[[resp_input[i]]] <= floor(max_roi_enet/2),
                              "big", "small"))
  coefs_enet_all[[resp_input[i]]][["data"]][[1]] <- coefs_enet_all[[resp_input[i]]][["data"]][[1]] %>% 
    mutate(direction = coefs_enet_all[[resp_input[i]]][["data"]][[2]]$direction)
}

coefs_enet_all <- coefs_enet_all %>%map(.,~unnest(.)) 

resp_input %>% map(.,~ggplot(coefs_enet_all[[.]], aes(x = fct_reorder(roi, wmean), 
             y = wmean, 
             ymax = wmean + 2 * wsd, ymin = wmean - 2 * wsd,
             col = type)) +
  geom_pointrange(fatten = 0.5, key_glyph = 'point') +
  scale_y_continuous(labels = numform::ff_num(zero = 0, digits = 2)) + 
  scale_color_manual(values = c("#56B4E9", "black"), labels = c("Permuted Null", "Target")) +
  coord_flip() +
  guides(colour = guide_legend(override.aes = list(size = 2.5))) +
  labs(x = 'Explanatory Variables (Brain Regions)', 
       y = 'Averaged Coefficient Across Models (±2 SD)', 
       col = 'Model type',
       title = paste0("DTI Predicting\n", plotting_names$longer_name[[which(plotting_names$response == .)]])) +
  #facet_wrap(~ direction, scales = 'free_y') +
  theme_bw() +
  theme(legend.title = element_blank(),
        legend.position = "top",
        axis.title.x = element_text(size = 15),
        axis.text.x = element_text(size = 12),
        axis.title.y = element_text(size = 15),
        axis.text.y = element_text(size = 12),
        legend.text = element_text(size = 15),
        plot.title = element_text(size = 15),
        strip.background = element_blank(),
        strip.text.x = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())
  )

  } 


enet_interval_plot_DTI(data_input=enet_tuning_fit,
                       tuning_input=enet_results_all,
                       resp_input = resp_names,
                       plotting_names=resp_var_plotting)
## $TFMRI_NB_ALL_BEH_C2B_RATE

## 
## $NIHTBX_PICVOCAB_UNCORRECTED

## 
## $NIHTBX_FLANKER_UNCORRECTED

## 
## $NIHTBX_PATTERN_UNCORRECTED

## 
## $NIHTBX_PICTURE_UNCORRECTED

## 
## $NIHTBX_READING_UNCORRECTED

## 
## $LMT_SCR_PERC_CORRECT

## 
## $PEA_RAVLT_LD_TRIAL_VII_TC

## 
## $TFMRI_SST_ALL_BEH_TOTAL_ISSRT

2.4 Combine the predicted values with Random Forest Opportunistic Stacking

2.4.1 Random Forest Data preparation for each cognitive task

Here we replace NAs from each modality with unlikely small and large values (-1000 and 1000), resulting in two predictors for each modality.
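
A minimal sketch of this NA coding on a toy tibble (hypothetical values), before the full preparation function below:

toy_pred <- tibble(SUBJECTKEY = c("A", "B"), smri_pred = c(0.5, NA))
left_join(toy_pred %>% mutate_all(~replace(., is.na(.), 1000)),   # ".x" copy: NA -> 1000
          toy_pred %>% mutate_all(~replace(., is.na(.), -1000)),  # ".y" copy: NA -> -1000
          by = "SUBJECTKEY")
## subject B gets smri_pred.x = 1000 and smri_pred.y = -1000, so a tree split
## can route missingness to either extreme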

## the following function processes the predicted values from each modality into the input format for the random forest stacking model
  rnd_for_data_prep = function(pred_select = "predicted",
                               data_list,
                               resp_chose,
                               resp_data=assessment(baseline_two_fold$splits[[1]]))
    {
    
      enet_pred <- plyr::join_all(list( 
        data_list[["MID_results"]][[resp_chose]][[pred_select]]%>% 
          dplyr::rename("MID_pred"="pred"),
        data_list[["smri_results"]][[resp_chose]][[pred_select]]%>% 
          dplyr::rename("smri_pred"="pred"),
        data_list[["SST_results"]][[resp_chose]][[pred_select]]%>% 
          dplyr::rename("SST_pred"="pred"),
        data_list[["rsmri_results"]][[resp_chose]][[pred_select]]%>% 
          dplyr::rename("rsmri_pred"="pred"),
        data_list[["Nback_results"]][[resp_chose]][[pred_select]]%>% 
          dplyr::rename("Nback_pred"="pred"),
        data_list[["DTI_results"]][[resp_chose]][[pred_select]]%>% 
          dplyr::rename("DTI_pred"="pred")), 
        by='SUBJECTKEY', type='full') %>% 
  distinct(SUBJECTKEY, .keep_all = TRUE)   
      
      enet_pred_p <- enet_pred%>% mutate_all(~replace(., is.na(.), 1000))
      enet_pred_n <- enet_pred%>% mutate_all(~replace(., is.na(.), -1000))
      
      enet_pred_noNA <- left_join(enet_pred_p,enet_pred_n,by="SUBJECTKEY")
      resp_data <- select(resp_data,"SUBJECTKEY",all_of(resp_chose)) %>% 
        distinct(SUBJECTKEY, .keep_all = TRUE) %>%   
        drop_na() %>% 
        mutate_if(is.double, ~ (.x - mean(.x)) / sd(.x))
      
      enet_pred_noNA <- left_join(enet_pred_noNA,resp_data,by="SUBJECTKEY")
      enet_pred_noNA_noID <- select(enet_pred_noNA,-SUBJECTKEY)
      
      return(list("ID"=enet_pred_noNA,"NoID"=enet_pred_noNA_noID))
  }

enet_rf_valid_data <- resp_names %>% 
  map(.,~rnd_for_data_prep(data_list = enet_results_all, 
                           resp_chose = all_of(.)))

enet_rf_test_baseline <- resp_names %>% 
  map(.,~rnd_for_data_prep(pred_select="predicted_baseline", 
                           data_list = enet_results_all, 
                           resp_chose = all_of(.),
                           resp_data = baseline_test))

enet_rf_test_followup <- resp_names %>% 
  map(.,~rnd_for_data_prep(pred_select="predicted_followup", 
                           data_list = enet_results_all, 
                           resp_chose = all_of(.),
                           resp_data = followup_analysis))


stack_datasize_extract <- function(resp_input, data_input){
  one_allset <- data.frame(datasplit_number =dim(data_input[[resp_input]][["ID"]])[1],
                       response=resp_input)
  return(one_allset)}

task_stack_valid <- resp_names %>% 
  map(.,~stack_datasize_extract(resp_input = .,data_input = enet_rf_valid_data))%>%
  do.call(rbind,.)%>%
  mutate(number_valid = datasplit_number)%>%
  subset(.,select=-c(datasplit_number))

task_stack_baseline <- resp_names %>% 
  map(.,~stack_datasize_extract(resp_input = .,data_input = enet_rf_test_baseline))%>%
  do.call(rbind,.)%>%
  mutate(baseline_test = datasplit_number)%>%
  subset(.,select=-c(datasplit_number))


task_stack_followup <- resp_names %>% 
  map(.,~stack_datasize_extract(resp_input = .,data_input = enet_rf_test_followup))%>%
  do.call(rbind,.)%>%
  mutate(followup_test = datasplit_number)%>%
  subset(.,select=-c(datasplit_number))

plyr::join_all(list(task_stack_valid,task_stack_baseline,task_stack_followup,resp_var_plotting), 
                 type="left", by = "response")%>%
  select("short_name","number_valid",ends_with("test"))%>%
  kableExtra::kbl(caption = paste0("Sample size of all the data sets in stacked models ")) %>%
   kableExtra::kable_classic(full_width = F, html_font = "Cambria")
Sample size of all the data sets in stacked models

short_name         number_valid   baseline_test   followup_test
2-back Work Mem    2480           4597            4902
Pic Vocab          2812           5240            5282
Flanker            2810           5235            5372
Pattern Speed      2804           5229            5351
Seq Memory         2809           5235            5368
Reading Recog      2809           5220            5269
Little Man         2808           5093            5458
Audi Verbal        2793           5182            5350
integration SSRT   1936           3725            4053

2.4.2 Random forest tuning function

Here we specify the 2nd-layer (stacking) model.

random_forest_tuning <- function(resp,valid_list,baseline_test, followup_test){
rf_train_data <- valid_list[[resp]]
rf_test_baseline <- baseline_test[[resp]]
rf_test_followup <- followup_test[[resp]]
### to avoid repetition in coding, data frame lists are created with ID and without ID.

resp_var <- resp
##data recipe, assign "SUBJECTKEY" to ID (a class of variables neither belong to predictors nor response variable)
## do not scale the input data as 1000 and -1000 are put there on purpose
forest_rec <- recipe(as.formula(paste0(resp_var, "~ .")), 
                     data = rf_train_data[["NoID"]]) %>%
              step_dummy(all_nominal()) %>% 
              prep(training = rf_train_data[["NoID"]], retain = TRUE)
## mtry is the number of predictors to sample at each split
## min_n (the number of observations needed to keep splitting nodes)
tune_spec <-rand_forest(mtry = tune(),
                         trees = 1000,
                         min_n = tune()) %>%
            set_mode("regression") %>%
            set_engine("ranger")

tune_wf <- workflow() %>%
  add_recipe(forest_rec) %>%
  add_model(tune_spec)

set.seed(123456)

## 10-fold cross-validation on the 2nd-layer (validation) set for tuning
all_rs <- rsample::vfold_cv(data=rf_train_data[["ID"]],v = 10,repeats = 1)
## automate generate grid for hyperparameters
#doParallel::registerDoParallel()


rf_grid <- grid_regular(
  mtry(range = c(1, 12)), #12 is the highest number of predictors 
  min_n(range = c(1, 1000)),
  levels = c(mtry = 12, min_n = 101) #mtry = 12 so that it increases one step at a time 
)

tune_res <- tune_grid(
  tune_wf,
  resamples = all_rs,
  metrics = metric_set(rmse),
  grid = rf_grid
)

best_tune <- select_best(tune_res, metric = "rmse")

final_rf <- finalize_model(
  tune_spec,
  best_tune
)

forest_prep <- prep(forest_rec)

## this one fit the random forest model with the best tuned parameters

rf_fit <- final_rf %>%
  set_engine("ranger",importance="permutation") %>% 
  parsnip::fit(as.formula(paste0(resp_var, "~ .")),
  data = juice(forest_prep) 
  )

pred_baseline <- predict(rf_fit,new_data = rf_test_baseline[["NoID"]])
pred_followup <- predict(rf_fit,new_data = rf_test_followup[["NoID"]])
performance_baseline <- cor(pred_baseline$.pred,rf_test_baseline[["ID"]][[resp]])
performance_followup <- cor(pred_followup$.pred,rf_test_followup[["ID"]][[resp]])
best_tune <- best_tune %>% mutate(performance_baseline=performance_baseline,
                                  performance_followup=performance_followup)
result_list <- list("param_tune"=best_tune, 
                    "predicted_baseline"=tibble(SUBJECTKEY=rf_test_baseline[["ID"]]$SUBJECTKEY, 
                                                "pred_baseline"=pred_baseline$.pred),
                    "predicted_followup"=tibble(SUBJECTKEY=rf_test_followup[["ID"]]$SUBJECTKEY,
                                                "pred_followup"=pred_followup$.pred),
                    "model_fit"=rf_fit)

return(result_list)
}
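
For a quick sanity check on a single task before the full parallel run in the next section, the function could be called directly (same arguments, one response; shown here purely as an illustration):

rf_fit_one <- random_forest_tuning(resp = resp_names[1],
                                   valid_list = enet_rf_valid_data,
                                   baseline_test = enet_rf_test_baseline,
                                   followup_test = enet_rf_test_followup)
rf_fit_one$param_tune  # tuned mtry/min_n plus test-set correlations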

2.4.3 Random forest model fit for each cognitive task

Here we fit the random forest stacking model for each cognitive task, using the out-of-sample predicted values from every modality. This step is time-consuming and computationally expensive, so it is not run by default.

##keep from using all the cores
##keep your computer safe
no_cores <- availableCores() - 4
plan(multisession,workers= no_cores)

rf_fit_all <- resp_names %>% 
  future_map(.,~random_forest_tuning(resp=.,
                                     valid_list = enet_rf_valid_data,
                                     baseline_test = enet_rf_test_baseline,
                                     followup_test = enet_rf_test_followup),
             .options = furrr_options(seed=TRUE),.progress = TRUE)

2.4.5 Plot non-conditional permuted variable importance of the random forest model

The results should be treated with caution, as opportunistic stacking includes inherently collinear predictors.

library(vip)
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
resp_names%>% map(.,~rf_fit_all[[.]][["model_fit"]] %>%
  vip(geom = "point"))  
## $TFMRI_NB_ALL_BEH_C2B_RATE

## 
## $NIHTBX_PICVOCAB_UNCORRECTED

## 
## $NIHTBX_FLANKER_UNCORRECTED

## 
## $NIHTBX_PATTERN_UNCORRECTED

## 
## $NIHTBX_PICTURE_UNCORRECTED

## 
## $NIHTBX_READING_UNCORRECTED

## 
## $LMT_SCR_PERC_CORRECT

## 
## $PEA_RAVLT_LD_TRIAL_VII_TC

## 
## $TFMRI_SST_ALL_BEH_TOTAL_ISSRT

2.4.6 Plot conditional permuted variable importance of the random forest stacking model
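
Conditional permutation importance (permimp on a party::cforest refit with the tuned mtry and min_n) accounts for the correlation among predictors that opportunistic stacking builds in by design. Before refitting, the duplicated ".x"/".y" predictors are renamed to readable "_large"/"_small" labels.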

enet_rf_valid_data_permu <- enet_rf_valid_data %>% map(.,function(data_input=.){
 data_output<- data_input[["NoID"]] %>%
   dplyr::rename("MID_large"="MID_pred.x",
                 "SST_large"="SST_pred.x",
                 "smri_large"="smri_pred.x",
                 "rsmri_large"="rsmri_pred.x",
                 "DTI_large"="DTI_pred.x",
                 "Nback_large"="Nback_pred.x",
                 "MID_small"="MID_pred.y",
                 "SST_small"="SST_pred.y",
                 "smri_small"="smri_pred.y",
                 "rsmri_small"="rsmri_pred.y",
                 "DTI_small"="DTI_pred.y",
                 "Nback_small"="Nback_pred.y")
 return(data_output)
  })


cf_fit <- resp_names %>% map(.,function(resp_var=.){
  cforest(as.formula(paste0(resp_var, "~ .")),
          data=enet_rf_valid_data_permu[[resp_var]],
          control= cforest_unbiased(mtry=rf_fit_all[[resp_var]][["param_tune"]]$mtry,
                                    ntree=1000,minsplit=rf_fit_all[[resp_var]][["param_tune"]]$min_n))
}) 

PI_permimp <-cf_fit %>% map(.,~permimp(.,progressBar = FALSE)) 

resp_names %>% map(.,~plot(PI_permimp[[.]], 
                           type = "box", 
                           horizontal = TRUE,
                           main=paste0("Conditional Permutation Importance ",
                                       resp_var_plotting$short_name[[which(resp_var_plotting$response==.)]])))

## $TFMRI_NB_ALL_BEH_C2B_RATE
## NULL
## 
## $NIHTBX_PICVOCAB_UNCORRECTED
## NULL
## 
## $NIHTBX_FLANKER_UNCORRECTED
## NULL
## 
## $NIHTBX_PATTERN_UNCORRECTED
## NULL
## 
## $NIHTBX_PICTURE_UNCORRECTED
## NULL
## 
## $NIHTBX_READING_UNCORRECTED
## NULL
## 
## $LMT_SCR_PERC_CORRECT
## NULL
## 
## $PEA_RAVLT_LD_TRIAL_VII_TC
## NULL
## 
## $TFMRI_SST_ALL_BEH_TOTAL_ISSRT
## NULL
resp_names %>% map(.,~plot(PI_permimp[[.]], 
                           type = "bar", 
                           interval="quantile",
                           main=paste0("Conditional Permutation Importance ",
                                       resp_var_plotting$short_name[[which(resp_var_plotting$response==.)]])))

## $TFMRI_NB_ALL_BEH_C2B_RATE
## NULL
## 
## $NIHTBX_PICVOCAB_UNCORRECTED
## NULL
## 
## $NIHTBX_FLANKER_UNCORRECTED
## NULL
## 
## $NIHTBX_PATTERN_UNCORRECTED
## NULL
## 
## $NIHTBX_PICTURE_UNCORRECTED
## NULL
## 
## $NIHTBX_READING_UNCORRECTED
## NULL
## 
## $LMT_SCR_PERC_CORRECT
## NULL
## 
## $PEA_RAVLT_LD_TRIAL_VII_TC
## NULL
## 
## $TFMRI_SST_ALL_BEH_TOTAL_ISSRT
## NULL

2.4.6.1 Plot observed against predicted values for the Random Forest Stacking model for each task

random_forest_plotting_followup <- function(rf_fit, followup_data, resp_vec, naming_frame ){
  followup_rf_pred <- resp_vec %>% map(.,function(resp_var=.){
  resp_pred <- tibble(pred = rf_fit[[resp_var]][["predicted_followup"]][["pred_followup"]],
                      observed=followup_data[[resp_var]][["ID"]][[resp_var]],
                      SUBJECTKEY=followup_data[[resp_var]][["ID"]][["SUBJECTKEY"]])
  return(resp_pred)
})

task_followup_pred <- resp_vec %>% 
  map(., ~ggplot(followup_rf_pred[[.]],
                 aes(x = scale(pred) , 
                     y = scale(observed))) +
  geom_jitter(height = 0.1, width = 0.1, size = 1, col = 'grey60') +
  geom_smooth(method = 'lm', se = FALSE, col = 'black')  +
  labs(x = NULL,
       y = NULL,
       title = paste (naming_frame$short_name[[which(naming_frame$response==.)]],
                      '\nr = ',round(rf_fit[[.]][["param_tune"]][["performance_followup"]], 3))) 
                     )
  title_task_rf_pred_plot <- ggdraw() + 
  draw_label(
    "Out-of-Sample Random Forest Stacking Predictive Ability at Followup",
    fontface = 'bold',
    x = 0,
    hjust = 0
  ) +
  theme(
    # add margin on the left of the drawing canvas,
    # so title is aligned with left edge of first plot
    plot.margin = margin(0, 0, 0, 7)
  )
  followup_pred_all_figure<- plot_grid(title_task_rf_pred_plot,plot_grid(plotlist = task_followup_pred),nrow = 2 , rel_heights = c(0.1, 1))

  ggpubr::annotate_figure(followup_pred_all_figure,left= ggpubr::text_grob("Observed Cognitive Performance (Z)",size=15,rot=90),
                        bottom = ggpubr::text_grob("Predicted Cognitive Performance (Z)",size=15))
}

random_forest_plotting_baseline <- function(rf_fit, baseline_data, resp_vec,naming_frame ){
 baseline_rf_pred <- resp_vec %>% map(.,function(resp_var=.){
  resp_pred <- tibble(pred = rf_fit[[resp_var]][["predicted_baseline"]][["pred_baseline"]],
                                 observed=baseline_data[[resp_var]][["ID"]][[resp_var]],
                                 SUBJECTKEY=baseline_data[[resp_var]][["ID"]][["SUBJECTKEY"]])
  return(resp_pred)
})

task_baseline_pred <- resp_vec %>% map(., ~ggplot(baseline_rf_pred[[.]],aes(x = scale(pred) , y = scale(observed))) +
  geom_jitter(height = 0.1, width = 0.1, size = 1, col = 'grey60') +
  geom_smooth(method = 'lm', se = FALSE, col = 'black')  +
  labs(x = NULL,
       y = NULL,
       title = paste (naming_frame$short_name[[which(naming_frame$response==.)]],'\nr = ',round(rf_fit[[.]][["param_tune"]][["performance_baseline"]], 3))) 
                     )
  title_task_rf_pred_plot_baseline <- ggdraw() + 
  draw_label(
    "Out-of-Sample Random Forest Stacking Predictive Ability at Baseline",
    fontface = 'bold',
    x = 0,
    hjust = 0
  ) +
  theme(
    # add margin on the left of the drawing canvas,
    # so title is aligned with left edge of first plot
    plot.margin = margin(0, 0, 0, 7)
  )
  baseline_pred_all_figure<- plot_grid(title_task_rf_pred_plot_baseline,
                                       plot_grid(plotlist = task_baseline_pred),
                                       nrow = 2 , 
                                       rel_heights = c(0.1, 1))

  ggpubr::annotate_figure(baseline_pred_all_figure,
                          left= ggpubr::text_grob("Observed Cognitive Performance (Z)",size=15,rot=90),
                        bottom = ggpubr::text_grob("Predicted Cognitive Performance (Z)",size=15)) 
}


random_forest_plotting_followup(rf_fit =rf_fit_all,followup_data=enet_rf_test_followup,naming_frame = resp_var_plotting, resp_vec= resp_names)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

random_forest_plotting_baseline(rf_fit =rf_fit_all,baseline_data=enet_rf_test_baseline,naming_frame = resp_var_plotting , resp_vec= resp_names)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

3 Relationship between MRI and G-factor: Confirmatory factor analysis

Here confirmatory factor analysis (CFA) is used to compute the g-factor.
We fit the models (first-order, second-order, and single-factor) on the training data, then apply the fitted models to the validation (2nd-layer) set and to the test sets (baseline and followup). This protects us from data leakage.

3.1 Set up CFA

# we only included tasks that are available at both time points

TaskDVs1Batch = c("NIHTBX_PICVOCAB_UNCORRECTED", 
                  "NIHTBX_READING_UNCORRECTED",
              "NIHTBX_FLANKER_UNCORRECTED",
              "NIHTBX_PATTERN_UNCORRECTED",
              "NIHTBX_PICTURE_UNCORRECTED",
               "PEA_RAVLT_LD_TRIAL_VII_TC"
)


CFA_lav_output_processing <- function(data_input,CFA_model1,CFA_model2,CFA_model3){
first_order_output <- lavaan::lavPredict(CFA_model1, newdata =data_input )
second_order_output <- lavaan::lavPredict(CFA_model2, newdata = data_input)
single_factor_output <- lavaan::lavPredict(CFA_model3, newdata = data_input)

### getting SUBJECTKEY for the CFA model predictions
### the rows of the CFA predictions match the rows of the task data after drop_na()
task_data <- select(data_input,all_of(TaskDVs1Batch),"SUBJECTKEY")%>% drop_na()

CFA_lav <- tibble(SUBJECTKEY =task_data$SUBJECTKEY, 
                  Lang_first_order = first_order_output[,1], 
                  CogFlex_first_order = first_order_output[,2], 
                  MemRe_first_order = first_order_output[,3],
                  g_second_order =second_order_output[,4],
                  g_single_factor =single_factor_output[,1])
return(CFA_lav)
}

CFA_lav_response <- function(data_input = baseline_two_fold$splits[[1]], 
                             test_baseline =baseline_test, 
                             test_followup=followup_analysis ){
  ## training the model and for enet parameter tuning
data_train <- analysis(data_input)%>%
              select(all_of(subj_info),all_of(TaskDVs1Batch))%>% drop_na()%>% IQR_remove()
## random forest tuning
data_valid <- assessment(data_input)%>%
              select(all_of(subj_info),all_of(TaskDVs1Batch))%>% drop_na()%>% IQR_remove()
##  test set for baseline observations
data_baseline <- test_baseline%>%
              select(all_of(subj_info),all_of(TaskDVs1Batch))%>% drop_na()%>% IQR_remove()
##  test set for followup observations
data_followup <- test_followup%>%
              select(all_of(subj_info),all_of(TaskDVs1Batch))%>% drop_na()%>% IQR_remove()
### first order CFA model

NeuroCog1stOrder <-'
Language =~ NIHTBX_PICVOCAB_UNCORRECTED + NIHTBX_READING_UNCORRECTED 
CognitiveFlexibity =~ NIHTBX_FLANKER_UNCORRECTED + NIHTBX_PATTERN_UNCORRECTED 
MemoryRecall =~ NIHTBX_PICTURE_UNCORRECTED + PEA_RAVLT_LD_TRIAL_VII_TC'

NeuroCog1stOrder.Fit <- lavaan::cfa(model = NeuroCog1stOrder, data = data_train,estimator="MLR")

### second order CFA model

NeuroCog2ndOrder <-'
Language =~ NIHTBX_PICVOCAB_UNCORRECTED + NIHTBX_READING_UNCORRECTED 
CognitiveFlexibity =~ NIHTBX_FLANKER_UNCORRECTED + NIHTBX_PATTERN_UNCORRECTED 
MemoryRecall =~ NIHTBX_PICTURE_UNCORRECTED + PEA_RAVLT_LD_TRIAL_VII_TC
g =~ NA*Language + CognitiveFlexibity  + MemoryRecall #NA* frees the first loading of g (estimated rather than fixed to 1 as a marker)
g ~~ 1*g #need to constrain the variance of g to 1'

NeuroCog2ndOrder.Fit <- lavaan::cfa(model = NeuroCog2ndOrder, data = data_train,estimator="MLR")

### single factor CFA model

NeuroCogSingleFactor <-'
g =~ NIHTBX_PICVOCAB_UNCORRECTED + NIHTBX_READING_UNCORRECTED + NIHTBX_FLANKER_UNCORRECTED + NIHTBX_PATTERN_UNCORRECTED + NIHTBX_PICTURE_UNCORRECTED + PEA_RAVLT_LD_TRIAL_VII_TC'

NeuroCogSingleFactor.Fit <- lavaan::cfa(model = NeuroCogSingleFactor, data = data_train,estimator="MLR")

train_CFA_lav <- CFA_lav_output_processing(data_input = data_train,
                                           CFA_model1 = NeuroCog1stOrder.Fit,
                                           CFA_model2 = NeuroCog2ndOrder.Fit,
                                           CFA_model3 = NeuroCogSingleFactor.Fit)
valid_CFA_lav <- CFA_lav_output_processing(data_input = data_valid,
                                           CFA_model1 = NeuroCog1stOrder.Fit,
                                           CFA_model2 = NeuroCog2ndOrder.Fit,
                                           CFA_model3 = NeuroCogSingleFactor.Fit)
baseline_CFA_lav <- CFA_lav_output_processing(data_input = data_baseline,
                                              CFA_model1 = NeuroCog1stOrder.Fit,
                                              CFA_model2 = NeuroCog2ndOrder.Fit,
                                              CFA_model3 = NeuroCogSingleFactor.Fit)
followup_CFA_lav <- CFA_lav_output_processing(data_input = data_followup,
                                              CFA_model1 = NeuroCog1stOrder.Fit,
                                              CFA_model2 = NeuroCog2ndOrder.Fit,
                                              CFA_model3 = NeuroCogSingleFactor.Fit)

output_list <- list(enet_tuning = train_CFA_lav, 
                    RanFor_tuning = valid_CFA_lav, 
                    testing_baseline = baseline_CFA_lav,
                    testing_followup=followup_CFA_lav)

return(list(output_list, NeuroCog1stOrder.Fit, NeuroCog2ndOrder.Fit, NeuroCogSingleFactor.Fit))
}

CFA_list <- CFA_lav_response(data_input = baseline_two_fold$splits[[1]], 
                             test_baseline =baseline_test, 
                             test_followup=followup_analysis)
CFA_response <- CFA_list[[1]]
NeuroCog1stOrder.Fit <- CFA_list[[2]]
NeuroCog2ndOrder.Fit <- CFA_list[[3]]
NeuroCogSingleFactor.Fit <- CFA_list[[4]]

3.1.1 Find the number of EFA factors using parallel analysis

The parallel analysis suggests 3 factors.

data_train_fa <- analysis(baseline_two_fold$splits[[1]])%>%
              select(all_of(subj_info),all_of(TaskDVs1Batch))%>% drop_na()%>% IQR_remove()

data_train_fa %>%  select(all_of(TaskDVs1Batch)) %>%
  psych::fa.parallel(fm='ml', fa='fa')

## Parallel analysis suggests that the number of factors =  3  and the number of components =  NA

3.1.2 Fitting Oblimin FA

Task.faObli <- 
  data_train_fa %>%  
  select(all_of(TaskDVs1Batch)) %>%
  psych::fa(nfactors = 3, #number of factors
            rotate = "oblimin", #rotation type
            fm = "ml", #math
            score = T)
## Loading required namespace: GPArotation
print(Task.faObli$loadings, 
      #cut =.3, 
      sort=TRUE)
## 
## Loadings:
##                             ML1    ML3    ML2   
## NIHTBX_PICVOCAB_UNCORRECTED  0.702              
## NIHTBX_READING_UNCORRECTED   0.730              
## NIHTBX_PICTURE_UNCORRECTED          0.688       
## PEA_RAVLT_LD_TRIAL_VII_TC    0.129  0.575       
## NIHTBX_FLANKER_UNCORRECTED   0.147         0.534
## NIHTBX_PATTERN_UNCORRECTED                 0.679
## 
##                  ML1   ML3   ML2
## SS loadings    1.070 0.805 0.750
## Proportion Var 0.178 0.134 0.125
## Cumulative Var 0.178 0.313 0.437
Task.faObli
## Factor Analysis using method =  ml
## Call: psych::fa(r = ., nfactors = 3, rotate = "oblimin", scores = T, 
##     fm = "ml")
## Standardized loadings (pattern matrix) based upon correlation matrix
##                               ML1   ML3   ML2   h2   u2 com
## NIHTBX_PICVOCAB_UNCORRECTED  0.70  0.01  0.01 0.51 0.49 1.0
## NIHTBX_READING_UNCORRECTED   0.73  0.01  0.01 0.55 0.45 1.0
## NIHTBX_FLANKER_UNCORRECTED   0.15 -0.01  0.53 0.37 0.63 1.2
## NIHTBX_PATTERN_UNCORRECTED  -0.05  0.02  0.68 0.44 0.56 1.0
## NIHTBX_PICTURE_UNCORRECTED  -0.06  0.69  0.04 0.45 0.55 1.0
## PEA_RAVLT_LD_TRIAL_VII_TC    0.13  0.58 -0.04 0.41 0.59 1.1
## 
##                        ML1  ML3  ML2
## SS loadings           1.12 0.84 0.78
## Proportion Var        0.19 0.14 0.13
## Cumulative Var        0.19 0.33 0.46
## Proportion Explained  0.41 0.31 0.28
## Cumulative Proportion 0.41 0.72 1.00
## 
##  With factor correlations of 
##      ML1  ML3  ML2
## ML1 1.00 0.58 0.43
## ML3 0.58 1.00 0.42
## ML2 0.43 0.42 1.00
## 
## Mean item complexity =  1
## Test of the hypothesis that 3 factors are sufficient.
## 
## The degrees of freedom for the null model are  15  and the objective function was  1.02 with Chi Square of  2945.66
## The degrees of freedom for the model are 0  and the objective function was  0 
## 
## The root mean square of the residuals (RMSR) is  0 
## The df corrected root mean square of the residuals is  NA 
## 
## The harmonic number of observations is  2902 with the empirical chi square  0.01  with prob <  NA 
## The total number of observations was  2902  with Likelihood Chi Square =  0.01  with prob <  NA 
## 
## Tucker Lewis Index of factoring reliability =  -Inf
## Fit based upon off diagonal values = 1
## Measures of factor score adequacy             
##                                                    ML1  ML3  ML2
## Correlation of (regression) scores with factors   0.85 0.80 0.78
## Multiple R square of scores with factors          0.73 0.65 0.60
## Minimum correlation of possible factor scores     0.45 0.30 0.21

3.1.3 Prep data for the EFA -> CFA model

## training the model and for enet parameter tuning
data_train <- analysis(baseline_two_fold$splits[[1]])%>%
              select(all_of(subj_info),all_of(TaskDVs1Batch))%>% drop_na()%>% IQR_remove()
## random forest tuning
data_valid <- assessment(baseline_two_fold$splits[[1]])%>%
              select(all_of(subj_info),all_of(TaskDVs1Batch))%>% drop_na()%>% IQR_remove()
##  test set for baseline observations
data_baseline <- baseline_test%>%
              select(all_of(subj_info),all_of(TaskDVs1Batch))%>% drop_na()%>% IQR_remove()
##  test set for followup observations
data_followup <- followup_analysis%>%
              select(all_of(subj_info),all_of(TaskDVs1Batch))%>% drop_na()%>% IQR_remove()

3.1.4 Build the EFA -> CFA model using data_train

Task.faObli_data_train_pred <- psych::predict.psych(object = Task.faObli,
        data = data_train %>% select(all_of(TaskDVs1Batch)),
        old.data = data_train %>% select(all_of(TaskDVs1Batch))) %>%
        as_tibble() %>%
        rename(Language_EFA = ML1, 
               CognitiveFlexibity_EFA = ML2, 
               MemoryRecall_EFA = ML3)

NeuroCogEFA_CFA <-'
g =~ Language_EFA + MemoryRecall_EFA + CognitiveFlexibity_EFA'

NeuroCogEFA_CFA.Fit <- lavaan::cfa(model = NeuroCogEFA_CFA, 
                                   data = Task.faObli_data_train_pred,
                                   estimator="MLR")

CFA_response$enet_tuning <- 
  CFA_response$enet_tuning %>% bind_cols(
    lavaan::lavPredict(NeuroCogEFA_CFA.Fit, 
                       newdata =Task.faObli_data_train_pred)%>%
      as.double() %>% as_tibble() %>%
    rename(g_EFA_CFA = value))

3.1.5 Apply the EFA -> CFA model to data_valid, data_baseline, and data_followup

## data valid
Task.faObli_data_valid_pred <- psych::predict.psych(object = Task.faObli,
        data = data_valid %>% select(all_of(TaskDVs1Batch)),
        old.data = data_train %>% select(all_of(TaskDVs1Batch))) %>%
        as_tibble() %>%
        rename(Language_EFA = ML1, 
               CognitiveFlexibity_EFA = ML2, 
               MemoryRecall_EFA = ML3)

CFA_response$RanFor_tuning <- 
  CFA_response$RanFor_tuning %>% bind_cols(
    lavaan::lavPredict(NeuroCogEFA_CFA.Fit, 
                       newdata =Task.faObli_data_valid_pred)%>%
    as.double() %>% as_tibble() %>%
    rename(g_EFA_CFA = value))

## data_baseline
Task.faObli_data_baseline_pred <- psych::predict.psych(object = Task.faObli,
        data = data_baseline %>% select(all_of(TaskDVs1Batch)),
        old.data = data_train %>% select(all_of(TaskDVs1Batch))) %>%
        as_tibble() %>%
        rename(Language_EFA = ML1, 
               CognitiveFlexibity_EFA = ML2, 
               MemoryRecall_EFA = ML3)

CFA_response$testing_baseline <- 
  CFA_response$testing_baseline %>% bind_cols(
    lavaan::lavPredict(NeuroCogEFA_CFA.Fit, 
                       newdata =Task.faObli_data_baseline_pred)%>%
    as.double() %>% as_tibble() %>%
    rename(g_EFA_CFA = value))

## data_follow up
Task.faObli_data_followup_pred <- psych::predict.psych(object = Task.faObli,
        data = data_followup %>% select(all_of(TaskDVs1Batch)),
        old.data = data_train %>% select(all_of(TaskDVs1Batch))) %>%
        as_tibble() %>%
        rename(Language_EFA = ML1, 
               CognitiveFlexibity_EFA = ML2, 
               MemoryRecall_EFA = ML3)

CFA_response$testing_followup <- 
  CFA_response$testing_followup %>% bind_cols(
    lavaan::lavPredict(NeuroCogEFA_CFA.Fit, 
                       newdata =Task.faObli_data_followup_pred)%>%
    as.double() %>% as_tibble() %>%
    rename(g_EFA_CFA = value))

3.1.6 Look at the final EFA -> CFA model

lavaan::summary(NeuroCogEFA_CFA.Fit, standardized = TRUE, rsquare = TRUE, fit.measures = TRUE)
## lavaan 0.6-11 ended normally after 19 iterations
## 
##   Estimator                                         ML
##   Optimization method                           NLMINB
##   Number of model parameters                         6
##                                                       
##   Number of observations                          2902
##                                                       
## Model Test User Model:
##                                               Standard      Robust
##   Test Statistic                                 0.000       0.000
##   Degrees of freedom                                 0           0
## 
## Model Test Baseline Model:
## 
##   Test statistic                              3882.254    3837.255
##   Degrees of freedom                                 3           3
##   P-value                                        0.000       0.000
##   Scaling correction factor                                  1.012
## 
## User Model versus Baseline Model:
## 
##   Comparative Fit Index (CFI)                    1.000       1.000
##   Tucker-Lewis Index (TLI)                       1.000       1.000
##                                                                   
##   Robust Comparative Fit Index (CFI)                         1.000
##   Robust Tucker-Lewis Index (TLI)                            1.000
## 
## Loglikelihood and Information Criteria:
## 
##   Loglikelihood user model (H0)              -8584.296   -8584.296
##   Loglikelihood unrestricted model (H1)      -8584.296   -8584.296
##                                                                   
##   Akaike (AIC)                               17180.591   17180.591
##   Bayesian (BIC)                             17216.430   17216.430
##   Sample-size adjusted Bayesian (BIC)        17197.366   17197.366
## 
## Root Mean Square Error of Approximation:
## 
##   RMSEA                                          0.000       0.000
##   90 Percent confidence interval - lower         0.000       0.000
##   90 Percent confidence interval - upper         0.000       0.000
##   P-value RMSEA <= 0.05                             NA          NA
##                                                                   
##   Robust RMSEA                                               0.000
##   90 Percent confidence interval - lower                     0.000
##   90 Percent confidence interval - upper                     0.000
## 
## Standardized Root Mean Square Residual:
## 
##   SRMR                                           0.000       0.000
## 
## Parameter Estimates:
## 
##   Standard errors                             Sandwich
##   Information bread                           Observed
##   Observed information based on                Hessian
## 
## Latent Variables:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##   g =~                                                                  
##     Language_EFA      1.000                               0.741    0.870
##     MemoryRcll_EFA    0.933    0.021   44.922    0.000    0.692    0.860
##     CgntvFlxbt_EFA    0.720    0.019   37.065    0.000    0.534    0.687
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##    .Language_EFA      0.177    0.011   16.639    0.000    0.177    0.244
##    .MemoryRcll_EFA    0.169    0.009   17.908    0.000    0.169    0.261
##    .CgntvFlxbt_EFA    0.318    0.010   32.873    0.000    0.318    0.528
##     g                 0.550    0.022   25.501    0.000    1.000    1.000
## 
## R-Square:
##                    Estimate
##     Language_EFA      0.756
##     MemoryRcll_EFA    0.739
##     CgntvFlxbt_EFA    0.472
Plabels = c("Language","Memory\nRecall","Mental\nFlexibity",expression(paste(italic("g"))))
semPlot::semPaths(object=NeuroCogEFA_CFA.Fit,
                  intercepts = F, residuals = F,
                  whatLabels="no",
                  what = "std",
                  layout="tree",
                  node.width = 1.4,
                  edge.label.cex = .6,
                  nodeLabels=Plabels,
                  edge.color="black",
                  sizeMan = 10, sizeLat = 10)

#semPlot::semPaths(NeuroCogEFA_CFA.Fit)

3.1.7 Plot the EFA part

colnames(Task.faObli$loadings) <- c("Language","Memory\nRecall","Mental\nFlexibity")
rownames(Task.faObli$loadings) <- 
  c("Vocab","Reading","Flanker","Pattern","Picture","RAVLT")
psych::fa.diagram(Task.faObli)

Task.faObli
## Factor Analysis using method =  ml
## Call: psych::fa(r = ., nfactors = 3, rotate = "oblimin", scores = T, 
##     fm = "ml")
## Standardized loadings (pattern matrix) based upon correlation matrix
##         Language Memory\nRecall Mental\nFlexibity   h2   u2 com
## Vocab       0.70           0.01              0.01 0.51 0.49 1.0
## Reading     0.73           0.01              0.01 0.55 0.45 1.0
## Flanker     0.15          -0.01              0.53 0.37 0.63 1.2
## Pattern    -0.05           0.02              0.68 0.44 0.56 1.0
## Picture    -0.06           0.69              0.04 0.45 0.55 1.0
## RAVLT       0.13           0.58             -0.04 0.41 0.59 1.1
## 
##                       Language Memory\nRecall Mental\nFlexibity
## SS loadings               1.12           0.84              0.78
## Proportion Var            0.19           0.14              0.13
## Cumulative Var            0.19           0.33              0.46
## Proportion Explained      0.41           0.31              0.28
## Cumulative Proportion     0.41           0.72              1.00
## 
##  With factor correlations of 
##                   Language Memory\nRecall Mental\nFlexibity
## Language              1.00           0.58              0.43
## Memory\nRecall        0.58           1.00              0.42
## Mental\nFlexibity     0.43           0.42              1.00
## 
## Mean item complexity =  1
## Test of the hypothesis that 3 factors are sufficient.
## 
## The degrees of freedom for the null model are  15  and the objective function was  1.02 with Chi Square of  2945.66
## The degrees of freedom for the model are 0  and the objective function was  0 
## 
## The root mean square of the residuals (RMSR) is  0 
## The df corrected root mean square of the residuals is  NA 
## 
## The harmonic number of observations is  2902 with the empirical chi square  0.01  with prob <  NA 
## The total number of observations was  2902  with Likelihood Chi Square =  0.01  with prob <  NA 
## 
## Tucker Lewis Index of factoring reliability =  -Inf
## Fit based upon off diagonal values = 1
## Measures of factor score adequacy             
##                                                   Language Memory\nRecall
## Correlation of (regression) scores with factors       0.85           0.80
## Multiple R square of scores with factors              0.73           0.65
## Minimum correlation of possible factor scores         0.45           0.30
##                                                   Mental\nFlexibity
## Correlation of (regression) scores with factors                0.78
## Multiple R square of scores with factors                       0.60
## Minimum correlation of possible factor scores                  0.21

3.1.8 Plot and provide summary statistics of the first-order model

lavaan::summary(NeuroCog1stOrder.Fit, standardized = TRUE, rsquare = TRUE, fit.measures = TRUE)
## lavaan 0.6-11 ended normally after 31 iterations
## 
##   Estimator                                         ML
##   Optimization method                           NLMINB
##   Number of model parameters                        15
##                                                       
##   Number of observations                          2902
##                                                       
## Model Test User Model:
##                                                Standard      Robust
##   Test Statistic                                 20.462      20.029
##   Degrees of freedom                                  6           6
##   P-value (Chi-square)                            0.002       0.003
##   Scaling correction factor                                   1.022
##        Yuan-Bentler correction (Mplus variant)                     
## 
## Model Test Baseline Model:
## 
##   Test statistic                              2949.557    2772.060
##   Degrees of freedom                                15          15
##   P-value                                        0.000       0.000
##   Scaling correction factor                                  1.064
## 
## User Model versus Baseline Model:
## 
##   Comparative Fit Index (CFI)                    0.995       0.995
##   Tucker-Lewis Index (TLI)                       0.988       0.987
##                                                                   
##   Robust Comparative Fit Index (CFI)                         0.995
##   Robust Tucker-Lewis Index (TLI)                            0.988
## 
## Loglikelihood and Information Criteria:
## 
##   Loglikelihood user model (H0)             -23239.010  -23239.010
##   Scaling correction factor                                  1.121
##       for the MLR correction                                      
##   Loglikelihood unrestricted model (H1)     -23228.779  -23228.779
##   Scaling correction factor                                  1.093
##       for the MLR correction                                      
##                                                                   
##   Akaike (AIC)                               46508.020   46508.020
##   Bayesian (BIC)                             46597.617   46597.617
##   Sample-size adjusted Bayesian (BIC)        46549.957   46549.957
## 
## Root Mean Square Error of Approximation:
## 
##   RMSEA                                          0.029       0.028
##   90 Percent confidence interval - lower         0.016       0.015
##   90 Percent confidence interval - upper         0.043       0.042
##   P-value RMSEA <= 0.05                          0.994       0.996
##                                                                   
##   Robust RMSEA                                               0.029
##   90 Percent confidence interval - lower                     0.015
##   90 Percent confidence interval - upper                     0.043
## 
## Standardized Root Mean Square Residual:
## 
##   SRMR                                           0.014       0.014
## 
## Parameter Estimates:
## 
##   Standard errors                             Sandwich
##   Information bread                           Observed
##   Observed information based on                Hessian
## 
## Latent Variables:
##                         Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##   Language =~                                                                
##     NIHTBX_PICVOCA         1.000                               0.712    0.712
##     NIHTBX_READING         1.042    0.048   21.683    0.000    0.742    0.742
##   CognitiveFlexibity =~                                                      
##     NIHTBX_FLANKER         1.000                               0.721    0.721
##     NIHTBX_PATTERN         0.749    0.058   12.814    0.000    0.541    0.541
##   MemoryRecall =~                                                            
##     NIHTBX_PICTURE         1.000                               0.603    0.603
##     PEA_RAVLT_LD_T         1.152    0.068   16.872    0.000    0.694    0.695
## 
## Covariances:
##                         Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##   Language ~~                                                                
##     CognitivFlxbty         0.266    0.019   14.102    0.000    0.518    0.518
##     MemoryRecall           0.272    0.017   15.811    0.000    0.633    0.633
##   CognitiveFlexibity ~~                                                      
##     MemoryRecall           0.193    0.017   11.130    0.000    0.443    0.443
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##    .NIHTBX_PICVOCA    0.493    0.025   19.590    0.000    0.493    0.493
##    .NIHTBX_READING    0.449    0.026   17.058    0.000    0.449    0.449
##    .NIHTBX_FLANKER    0.479    0.040   12.110    0.000    0.479    0.479
##    .NIHTBX_PATTERN    0.707    0.030   23.896    0.000    0.707    0.708
##    .NIHTBX_PICTURE    0.637    0.026   24.382    0.000    0.637    0.637
##    .PEA_RAVLT_LD_T    0.517    0.032   16.296    0.000    0.517    0.518
##     Language          0.507    0.031   16.267    0.000    1.000    1.000
##     CognitivFlxbty    0.520    0.047   11.106    0.000    1.000    1.000
##     MemoryRecall      0.363    0.028   12.876    0.000    1.000    1.000
## 
## R-Square:
##                    Estimate
##     NIHTBX_PICVOCA    0.507
##     NIHTBX_READING    0.551
##     NIHTBX_FLANKER    0.521
##     NIHTBX_PATTERN    0.292
##     NIHTBX_PICTURE    0.363
##     PEA_RAVLT_LD_T    0.482
Plabels = c("Vocab","Reading","Flanker","Pattern","Picture","RAVLT","Language","Mental\nFlexibity","Memory\nRecall")
semPlot::semPaths(object=NeuroCog1stOrder.Fit,
                  intercepts = F, residuals = F, 
                  whatLabels="no", 
                  what = "std", 
                  layout="tree",
                  node.width = 1.4,
                  edge.label.cex = .6, 
                  nodeLabels=Plabels, 
                  edge.color="black",
                  sizeMan = 10, sizeLat = 10)

3.1.9 Plot and provide summary statistics of the second-order model

lavaan::summary(NeuroCog2ndOrder.Fit, standardized = TRUE, rsquare = TRUE, fit.measures = TRUE)
## lavaan 0.6-11 ended normally after 29 iterations
## 
##   Estimator                                         ML
##   Optimization method                           NLMINB
##   Number of model parameters                        15
##                                                       
##   Number of observations                          2902
##                                                       
## Model Test User Model:
##                                                Standard      Robust
##   Test Statistic                                 20.462      20.029
##   Degrees of freedom                                  6           6
##   P-value (Chi-square)                            0.002       0.003
##   Scaling correction factor                                   1.022
##        Yuan-Bentler correction (Mplus variant)                     
## 
## Model Test Baseline Model:
## 
##   Test statistic                              2949.557    2772.060
##   Degrees of freedom                                15          15
##   P-value                                        0.000       0.000
##   Scaling correction factor                                  1.064
## 
## User Model versus Baseline Model:
## 
##   Comparative Fit Index (CFI)                    0.995       0.995
##   Tucker-Lewis Index (TLI)                       0.988       0.987
##                                                                   
##   Robust Comparative Fit Index (CFI)                         0.995
##   Robust Tucker-Lewis Index (TLI)                            0.988
## 
## Loglikelihood and Information Criteria:
## 
##   Loglikelihood user model (H0)             -23239.010  -23239.010
##   Scaling correction factor                                  1.121
##       for the MLR correction                                      
##   Loglikelihood unrestricted model (H1)     -23228.779  -23228.779
##   Scaling correction factor                                  1.093
##       for the MLR correction                                      
##                                                                   
##   Akaike (AIC)                               46508.020   46508.020
##   Bayesian (BIC)                             46597.617   46597.617
##   Sample-size adjusted Bayesian (BIC)        46549.957   46549.957
## 
## Root Mean Square Error of Approximation:
## 
##   RMSEA                                          0.029       0.028
##   90 Percent confidence interval - lower         0.016       0.015
##   90 Percent confidence interval - upper         0.043       0.042
##   P-value RMSEA <= 0.05                          0.994       0.996
##                                                                   
##   Robust RMSEA                                               0.029
##   90 Percent confidence interval - lower                     0.015
##   90 Percent confidence interval - upper                     0.043
## 
## Standardized Root Mean Square Residual:
## 
##   SRMR                                           0.014       0.014
## 
## Parameter Estimates:
## 
##   Standard errors                             Sandwich
##   Information bread                           Observed
##   Observed information based on                Hessian
## 
## Latent Variables:
##                         Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##   Language =~                                                                
##     NIHTBX_PICVOCA         1.000                               0.712    0.712
##     NIHTBX_READING         1.042    0.048   21.683    0.000    0.742    0.742
##   CognitiveFlexibity =~                                                      
##     NIHTBX_FLANKER         1.000                               0.721    0.721
##     NIHTBX_PATTERN         0.749    0.058   12.814    0.000    0.541    0.541
##   MemoryRecall =~                                                            
##     NIHTBX_PICTURE         1.000                               0.603    0.603
##     PEA_RAVLT_LD_T         1.152    0.068   16.872    0.000    0.694    0.695
##   g =~                                                                       
##     Language               0.613    0.029   20.816    0.000    0.860    0.860
##     CognitivFlxbty         0.434    0.025   17.213    0.000    0.602    0.602
##     MemoryRecall           0.444    0.027   16.494    0.000    0.736    0.736
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##     g                 1.000                               1.000    1.000
##    .NIHTBX_PICVOCA    0.493    0.025   19.590    0.000    0.493    0.493
##    .NIHTBX_READING    0.449    0.026   17.058    0.000    0.449    0.449
##    .NIHTBX_FLANKER    0.479    0.040   12.110    0.000    0.479    0.479
##    .NIHTBX_PATTERN    0.707    0.030   23.896    0.000    0.707    0.708
##    .NIHTBX_PICTURE    0.637    0.026   24.382    0.000    0.637    0.637
##    .PEA_RAVLT_LD_T    0.517    0.032   16.296    0.000    0.517    0.518
##    .Language          0.132    0.029    4.505    0.000    0.260    0.260
##    .CognitivFlxbty    0.332    0.039    8.535    0.000    0.637    0.637
##    .MemoryRecall      0.166    0.020    8.361    0.000    0.458    0.458
## 
## R-Square:
##                    Estimate
##     NIHTBX_PICVOCA    0.507
##     NIHTBX_READING    0.551
##     NIHTBX_FLANKER    0.521
##     NIHTBX_PATTERN    0.292
##     NIHTBX_PICTURE    0.363
##     PEA_RAVLT_LD_T    0.482
##     Language          0.740
##     CognitivFlxbty    0.363
##     MemoryRecall      0.542
Plabels = c("Vocab","Reading","Flanker","Pattern","Picture","RAVLT","Language","Mental\nFlexibity","Memory\nRecall",expression(paste(italic("g"))))
semPlot::semPaths(object=NeuroCog2ndOrder.Fit,intercepts = F, residuals = F, 
        whatLabels="no", what = "std", layout="tree", node.width = 1.4,
         edge.label.cex = .6, nodeLabels=Plabels, edge.color="black",
        sizeMan = 10, sizeLat = 10)

3.1.10 reliability of the G-Factor

semTools::reliabilityL2(NeuroCog2ndOrder.Fit,"g")
##        omegaL1        omegaL2 partialOmegaL1 
##      0.6103282      0.7792178      0.7282842

3.1.11 plot and provide summary statistics of the single-factor model

lavaan::summary(NeuroCogSingleFactor.Fit, standardized = TRUE, rsquare = TRUE, fit.measures = TRUE)
## lavaan 0.6-11 ended normally after 20 iterations
## 
##   Estimator                                         ML
##   Optimization method                           NLMINB
##   Number of model parameters                        12
##                                                       
##   Number of observations                          2902
##                                                       
## Model Test User Model:
##                                                Standard      Robust
##   Test Statistic                                513.874     510.350
##   Degrees of freedom                                  9           9
##   P-value (Chi-square)                            0.000       0.000
##   Scaling correction factor                                   1.007
##        Yuan-Bentler correction (Mplus variant)                     
## 
## Model Test Baseline Model:
## 
##   Test statistic                              2949.557    2772.060
##   Degrees of freedom                                15          15
##   P-value                                        0.000       0.000
##   Scaling correction factor                                  1.064
## 
## User Model versus Baseline Model:
## 
##   Comparative Fit Index (CFI)                    0.828       0.818
##   Tucker-Lewis Index (TLI)                       0.713       0.697
##                                                                   
##   Robust Comparative Fit Index (CFI)                         0.828
##   Robust Tucker-Lewis Index (TLI)                            0.713
## 
## Loglikelihood and Information Criteria:
## 
##   Loglikelihood user model (H0)             -23485.716  -23485.716
##   Scaling correction factor                                  1.157
##       for the MLR correction                                      
##   Loglikelihood unrestricted model (H1)     -23228.779  -23228.779
##   Scaling correction factor                                  1.093
##       for the MLR correction                                      
##                                                                   
##   Akaike (AIC)                               46995.431   46995.431
##   Bayesian (BIC)                             47067.109   47067.109
##   Sample-size adjusted Bayesian (BIC)        47028.981   47028.981
## 
## Root Mean Square Error of Approximation:
## 
##   RMSEA                                          0.139       0.139
##   90 Percent confidence interval - lower         0.129       0.128
##   90 Percent confidence interval - upper         0.149       0.149
##   P-value RMSEA <= 0.05                          0.000       0.000
##                                                                   
##   Robust RMSEA                                               0.139
##   90 Percent confidence interval - lower                     0.129
##   90 Percent confidence interval - upper                     0.149
## 
## Standardized Root Mean Square Residual:
## 
##   SRMR                                           0.070       0.070
## 
## Parameter Estimates:
## 
##   Standard errors                             Sandwich
##   Information bread                           Observed
##   Observed information based on                Hessian
## 
## Latent Variables:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##   g =~                                                                  
##     NIHTBX_PICVOCA    1.000                               0.668    0.668
##     NIHTBX_READING    1.028    0.035   29.109    0.000    0.687    0.687
##     NIHTBX_FLANKER    0.670    0.045   14.852    0.000    0.447    0.447
##     NIHTBX_PATTERN    0.537    0.042   12.642    0.000    0.358    0.358
##     NIHTBX_PICTURE    0.706    0.046   15.506    0.000    0.472    0.472
##     PEA_RAVLT_LD_T    0.787    0.045   17.566    0.000    0.526    0.526
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##    .NIHTBX_PICVOCA    0.554    0.024   23.532    0.000    0.554    0.554
##    .NIHTBX_READING    0.528    0.025   21.063    0.000    0.528    0.528
##    .NIHTBX_FLANKER    0.800    0.026   30.533    0.000    0.800    0.800
##    .NIHTBX_PATTERN    0.871    0.025   34.266    0.000    0.871    0.872
##    .NIHTBX_PICTURE    0.777    0.024   32.574    0.000    0.777    0.777
##    .PEA_RAVLT_LD_T    0.723    0.026   28.200    0.000    0.723    0.724
##     g                 0.446    0.029   15.272    0.000    1.000    1.000
## 
## R-Square:
##                    Estimate
##     NIHTBX_PICVOCA    0.446
##     NIHTBX_READING    0.472
##     NIHTBX_FLANKER    0.200
##     NIHTBX_PATTERN    0.128
##     NIHTBX_PICTURE    0.223
##     PEA_RAVLT_LD_T    0.276
Plabels = c("Vocab","Reading","Flanker","Pattern","Picture","RAVLT", expression(paste(italic("g"))))
semPlot::semPaths(object=NeuroCogSingleFactor.Fit,
                  intercepts = F, residuals = F, 
                  whatLabels="no", 
                  what = "std", 
                  layout="tree",
                  node.width = 1.4,
                  edge.label.cex = .6, 
                  nodeLabels=Plabels, 
                  edge.color="black",
                  sizeMan = 10, sizeLat = 10)

3.1.12 testing the stability of the g-factor

data_first_year <- bind_rows(data_train,data_valid,data_baseline)

NeuroCog2ndOrder <-'
Language =~ NIHTBX_PICVOCAB_UNCORRECTED + NIHTBX_READING_UNCORRECTED
CognitiveFlexibity =~ NIHTBX_FLANKER_UNCORRECTED + NIHTBX_PATTERN_UNCORRECTED
MemoryRecall =~ NIHTBX_PICTURE_UNCORRECTED + PEA_RAVLT_LD_TRIAL_VII_TC
g =~ NA*Language + CognitiveFlexibity  + MemoryRecall # freely estimate the loadings on g instead of fixing the first one as a marker
g ~~ 1*g # identify g by constraining its variance to 1'

NeuroCog2ndOrder.Fit_First_year <- lavaan::cfa(model = NeuroCog2ndOrder, 
                                               data = data_first_year,
                                               estimator="MLR")

CFA_NeuroCog2ndOrder_first_year <- 
   data_first_year %>% select(SUBJECTKEY) %>%
   bind_cols(
    lavaan::lavPredict(NeuroCog2ndOrder.Fit_First_year, 
                       newdata =data_first_year)  %>% as_tibble() %>% 
    rename(g_2ndOrder_first_year = g)) %>%
  select(SUBJECTKEY, g_2ndOrder_first_year)

CFA_all_first_year <- 
  CFA_response$enet_tuning %>% bind_cols(fold = "first_layer") %>%
  bind_rows(CFA_response$RanFor_tuning %>% bind_cols(fold = "second_layer") %>%
              bind_rows(CFA_response$testing_baseline %>% bind_cols(fold = "baseline_test"))) %>%
  left_join(CFA_NeuroCog2ndOrder_first_year, by = "SUBJECTKEY")

3.1.13 correlation in 2nd-order g between scores based on the 1st-layer training data vs. the whole baseline data

CFA_all_first_year %>% 
  group_by(fold) %>%
  summarize (cor=cor (g_second_order, g_2ndOrder_first_year))
## # A tibble: 3 x 2
##   fold            cor
##   <chr>         <dbl>
## 1 baseline_test 0.997
## 2 first_layer   0.997
## 3 second_layer  0.997
fold.labs <- c("1st-layer train\nr = .9973686", "2nd-layer train\nr = .9973635", "Baseline Test\nr = .9970127")
names(fold.labs) <- c("first_layer", "second_layer","baseline_test" )

CFA_all_first_year %>%
  mutate(fold=factor(fold, levels=c("first_layer", "second_layer","baseline_test"))) %>%
  ggplot(aes(x=g_second_order, y=g_2ndOrder_first_year)) +
  geom_point() + 
  geom_smooth(method=lm, color = "grey") + 
  facet_wrap(~fold, ncol = 3, nrow = 1,
             labeller = labeller(fold = fold.labs)) +
  labs(x = expression(paste("2nd-order ",italic("g"), "-Factor Based on 1st-Layer Train Data")),
       y = ~ atop(paste("2nd-order ",italic("g"),"-Factor"), "Based on Full Baseline Data")) +
   theme(legend.position = "none") +
   #ggpubr::stat_cor(method = "pearson") + #label.x = -5, label.y = 30)
   theme(text=element_text(size=19)) 
## Don't know how to automatically pick scale for object of type lavaan.matrix/matrix. Defaulting to continuous.
## `geom_smooth()` using formula 'y ~ x'

lowerFn <- function(data, mapping, continuous = "cor", ...) {
  p <- ggplot(data = data, mapping = mapping) +
    geom_point(colour = "black") +
    geom_smooth(color = "grey", ...)
  p
}

CFA_all_first_year_g <- CFA_all_first_year %>%
  select(g_second_order,g_single_factor,g_EFA_CFA)

colnames(CFA_all_first_year_g) <- 
  make.names(c('2nd Order', 'Single Factor', 'EFA CFA'))

g_first_layer_plot <- CFA_all_first_year %>%
  filter(fold == "first_layer") %>%
  select(g_second_order,g_single_factor,g_EFA_CFA) %>%
  GGally::ggpairs(.,
                  title = "1st-Layer Train Data", 
                  columnLabels = gsub('X', '', gsub('.', '\n', colnames(CFA_all_first_year_g), fixed = T)), 
                  lower = list(continuous = lowerFn)) 
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
g_second_layer_plot <- CFA_all_first_year %>%
  filter(fold == "second_layer") %>%
  select(g_second_order,g_single_factor,g_EFA_CFA) %>%
  GGally::ggpairs(.,
                  title = "2nd-Layer Train Data", 
                  columnLabels = gsub('X', '', gsub('.', '\n', colnames(CFA_all_first_year_g), fixed = T)), 
                  lower = list(continuous = lowerFn)) 

g_baseline_test_plot <- CFA_all_first_year %>%
  filter(fold == "baseline_test") %>%
  select(g_second_order,g_single_factor,g_EFA_CFA) %>%
  GGally::ggpairs(.,
                  title = "Baseline Test Data",
                  columnLabels = gsub('X', '', gsub('.', '\n', colnames(CFA_all_first_year_g), fixed = T)), 
                  lower = list(continuous = lowerFn))

g_followup_test_plot <- 
CFA_response$testing_followup %>%
  select(g_second_order,g_single_factor,g_EFA_CFA) %>%
  GGally::ggpairs(.,
                  title = "Follow-up Test Data",
                  columnLabels = gsub('X', '', gsub('.', '\n', colnames(CFA_all_first_year_g), fixed = T)), 
                  lower = list(continuous = lowerFn))

G_plot_grid <- cowplot::plot_grid(
  GGally::ggmatrix_gtable(g_first_layer_plot),
  GGally::ggmatrix_gtable(g_second_layer_plot),
  GGally::ggmatrix_gtable(g_baseline_test_plot),
  GGally::ggmatrix_gtable(g_followup_test_plot),
 # ggmatrix_gtable(g3),
  nrow = 2
) 
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# now add the title
title <- ggdraw() + 
  draw_label(
    expression(paste("Relationship among ",italic("g"), "-Factor Based on Different CFA models at Different Data Splits")),
    fontface = 'bold',
    x = 0,
    hjust = 0
  ) +
  theme(
    # add margin on the left of the drawing canvas,
    # so title is aligned with left edge of first plot
    plot.margin = margin(0, 0, 0, 7)
  )
plot_grid(
  title, G_plot_grid,
  ncol = 1,
  # rel_heights values control vertical title margins
  rel_heights = c(0.05, 1)
)

3.1.14 Joining extracted factor scores with the data

#CFA_Resp_Var <- names(CFA_response[["enet_tuning"]])[-1]%>% set_names()
## to focus on g_second_order, remove g_single_factora and g_EFA_CFA
CFA_Resp_Var <- names(CFA_response[["enet_tuning"]] %>%
                        select(-g_single_factor, -g_EFA_CFA))[-1]%>% set_names()


train_CFA <- left_join(CFA_response[["enet_tuning"]],
                       analysis( baseline_two_fold$splits[[1]]), 
                       by = "SUBJECTKEY")
valid_CFA <- left_join(CFA_response[["RanFor_tuning"]],
                       assessment( baseline_two_fold$splits[[1]]), 
                       by = "SUBJECTKEY")
baseline_CFA <- left_join(CFA_response[["testing_baseline"]],
                          baseline_test, 
                          by = "SUBJECTKEY")
followup_CFA <- left_join(CFA_response[["testing_followup"]],
                          followup_analysis, 
                          by = "SUBJECTKEY")

3.1.15 look at the number of observations in each split

nrow(train_CFA)
## [1] 2906
nrow(valid_CFA)
## [1] 2917
nrow(baseline_CFA )
## [1] 5427
nrow(followup_CFA )
## [1] 5280

3.1.16 Plot the distribution of the G-Factor

GDistData <- data.table::data.table(G_Factor = train_CFA$g_second_order, Split = "1st-Layer Train") %>%
  rbind(data.table::data.table(G_Factor = valid_CFA$g_second_order, Split = "2nd-Layer Train")) %>%
  rbind(data.table::data.table(G_Factor = baseline_CFA$g_second_order, Split = "Baseline Test")) %>%
  rbind(data.table::data.table(G_Factor = followup_CFA$g_second_order, Split = "Follow-Up Test")) 

ggplot(data = GDistData, aes(x = G_Factor, color = Split)) + 
  geom_density() +
  labs(x= expression(paste("2nd-order ",italic("g")))) +
  labs(y= "Density") +
  theme(legend.title=element_blank()) +
  theme(text = element_text(size=30)) +
  theme(legend.position="top", legend.box="vertical", legend.margin=margin()) +
  guides(color=guide_legend(nrow=2,byrow=TRUE)) 

3.1.17 set CFA variable names for plotting

CFA_var_plotting <- tibble("response" =CFA_Resp_Var, 
                           "longer_name"=c("1st-Order Language", 
                                           "1st-Order Mental Flexibility", 
                                           "1st-Order Memory Recall", 
                                           "2nd-Order G-Factor"),
                           "short_name"=c("Language", 
                                          "Mental Flexibility", 
                                          "Memory Recall",
                                          "G-Factor"))

3.2 model fitting for the confirmatory factor analysis scores

3.2.1 data prepping for the elastic net for factor scores

Nback_enet_CFA <- CFA_Resp_Var %>% 
  map(.,~enet_processing_nocom(resp_var = .,
                               split_train = train_CFA, 
                               split_valid =  valid_CFA, 
                               split_test = baseline_CFA,
                               followup_test = followup_CFA,
                               contrast_name = "X2backvs0back_ROI_"))

SST_enet_CFA <- CFA_Resp_Var %>% 
  map(.,~enet_processing_nocom(resp_var = .,
                               split_train = train_CFA, 
                               split_valid =  valid_CFA, 
                               split_test = baseline_CFA,
                               followup_test = followup_CFA ,
                               contrast_name = "anystopvscorrectgo_ROI_"))

MID_enet_CFA <- CFA_Resp_Var %>% 
  map(.,~enet_processing_nocom(resp_var = .,
                               split_train = train_CFA, 
                               split_valid =  valid_CFA, 
                               split_test = baseline_CFA,
                               followup_test = followup_CFA,
                               contrast_name = "antiRewVsNeu_ROI_"))

DTI_enet_CFA <-  CFA_Resp_Var %>% 
  map(.,~enet_processing_comOne(resp_var = .,
                                split_train = train_CFA, 
                                split_valid =  valid_CFA, 
                                split_test = baseline_CFA,
                                followup_test = followup_CFA,
                                measure_one = "FA_"))

smri_enet_CFA <- CFA_Resp_Var %>% 
  map(.,~enet_processing_com(resp_var = .,
                             split_train = train_CFA, 
                             split_valid =  valid_CFA, 
                             split_test = baseline_CFA,
                             followup_test = followup_CFA,
                             measure_one = "ASEG_",
                             measure_two = "Dest_Thick_"))

rsmri_enet_CFA <- CFA_Resp_Var %>% 
  map(.,~enet_processing_com(resp_var = .,
                             split_train = train_CFA, 
                             split_valid =  valid_CFA, 
                             split_test = baseline_CFA,
                             followup_test = followup_CFA,
                             measure_one = "par",
                             measure_two = "Within"))

enet_CFA_data <- list("Nback"=Nback_enet_CFA,
                      "SST"=SST_enet_CFA,
                      "MID"=MID_enet_CFA,
                      "rsmri"=rsmri_enet_CFA,
                      "smri"=smri_enet_CFA,
                      "DTI"=DTI_enet_CFA)

3.2.2 parameter tuning for the elastic net for factor scores

## leave a few cores free so the machine stays responsive
no_cores <- availableCores() - 4
plan(multisession,workers= no_cores)

Nback_CFA <- CFA_Resp_Var %>% 
  future_map(.,~enet_tuning(resp_var =.,
                            data_list = Nback_enet_CFA[[.]]),
             .options = furrr_options(seed=TRUE))

SST_CFA <- CFA_Resp_Var %>% 
  future_map(.,~enet_tuning(resp_var =.,
                            data_list = SST_enet_CFA[[.]]),
             .options = furrr_options(seed=TRUE))

MID_CFA <- CFA_Resp_Var %>% 
  future_map(.,~enet_tuning(resp_var =.,
                            data_list = MID_enet_CFA[[.]]),
             .options = furrr_options(seed=TRUE))

DTI_CFA <-  CFA_Resp_Var %>% 
  future_map(.,~enet_tuning(resp_var =.,
                            data_list = DTI_enet_CFA[[.]]),
             .options = furrr_options(seed=TRUE))

smri_CFA <- CFA_Resp_Var %>% 
  future_map(.,~enet_tuning(resp_var =.,
                            data_list = smri_enet_CFA[[.]]),
             .options = furrr_options(seed=TRUE))

rsmri_CFA <- CFA_Resp_Var %>% 
  future_map(.,~enet_tuning(resp_var =.,
                            data_list = rsmri_enet_CFA[[.]]),
             .options = furrr_options(seed=TRUE))

enet_CFA <- list("Nback_results"=Nback_CFA,
                 "SST_results"=SST_CFA,
                 "MID_results"=MID_CFA,
                 "rsmri_results"=rsmri_CFA,
                 "smri_results"=smri_CFA,
                 "DTI_results"=DTI_CFA)

3.2.3 create table of the number of subjects in each data split

datasplits_number(data_list = enet_CFA,resp_vec = CFA_Resp_Var, resp_plotting = CFA_var_plotting)
[[1]]
Sample size of all the data sets
  modality   enet_train   stack_train   baseline   followup   short_name
  Nback            1369          1331       2653       2996   Language
  Nback            1369          1331       2653       2996   Mental Flexibility
  Nback            1369          1331       2653       2996   Memory Recall
  Nback            1369          1331       2653       2996   G-Factor

[[2]]
Sample size of all the data sets
  modality   enet_train   stack_train   baseline   followup   short_name
  SST              1412          1421       2791       2903   Language
  SST              1412          1421       2791       2903   Mental Flexibility
  SST              1412          1421       2791       2903   Memory Recall
  SST              1412          1421       2791       2903   G-Factor

[[3]]
Sample size of all the data sets
  modality   enet_train   stack_train   baseline   followup   short_name
  MID              1611          1591       3138       3127   Language
  MID              1611          1591       3138       3127   Mental Flexibility
  MID              1611          1591       3138       3127   Memory Recall
  MID              1611          1591       3138       3127   G-Factor

[[4]]
Sample size of all the data sets
  modality   enet_train   stack_train   baseline   followup   short_name
  rsmri            1988          2007       3779       4137   Language
  rsmri            1988          2007       3779       4137   Mental Flexibility
  rsmri            1988          2007       3779       4137   Memory Recall
  rsmri            1988          2007       3779       4137   G-Factor

[[5]]
Sample size of all the data sets
  modality   enet_train   stack_train   baseline   followup   short_name
  smri             2395          2428       4507       4569   Language
  smri             2395          2428       4507       4569   Mental Flexibility
  smri             2395          2428       4507       4569   Memory Recall
  smri             2395          2428       4507       4569   G-Factor

[[6]]
Sample size of all the data sets
  modality   enet_train   stack_train   baseline   followup   short_name
  DTI              2268          2260       4071       4230   Language
  DTI              2268          2260       4071       4230   Mental Flexibility
  DTI              2268          2260       4071       4230   Memory Recall
  DTI              2268          2260       4071       4230   G-Factor
modality_enet_CFA <-names(enet_CFA)

param_tune_enet <- modality_enet_CFA %>% 
  map(.,function(modality_input=.,list_input=enet_CFA){
  tuned_results <- list_input[[modality_input]][["g_second_order"]][["param_tune"]]%>%
                   mutate(modality=modality_input)
  return(tuned_results)
})%>% do.call(rbind,.)%>%
  select("penalty", "mixture","modality")

 param_tune_enet %>%
  kableExtra::kbl(caption = "Tuned parameters for elastic net") %>%
                                     kableExtra::kable_classic(full_width = F, html_font = "Cambria") 
Tuned parameters for elastic net
    penalty   mixture   modality
  1.1489510       0.0   Nback_results
  0.0195640       1.0   SST_results
  0.0252354       1.0   MID_results
  0.6080224       0.0   rsmri_results
  0.1162322       0.1   smri_results
  0.1162322       0.0   DTI_results

3.2.4 look at the mean and SD of the penalty term

# mean of the penalty term
param_tune_enet %>% summarise_at("penalty", mean)
## # A tibble: 1 x 1
##   penalty
##     <dbl>
## 1   0.339
# SD of the penalty term
param_tune_enet %>% summarise_at("penalty", sd)
## # A tibble: 1 x 1
##   penalty
##     <dbl>
## 1   0.453

3.2.5 Fit the tuned elastic net models with eNetXplorer for the factor scores

eNetXplorer allows us to use permutation tests to assess the significance of each model and each feature.
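
As a minimal, self-contained sketch of the call pattern on toy data (not the study data; the n_run, n_perm_null, and seed argument names are assumptions about the eNetXplorer interface, with n_run cross-validated runs compared against n_perm_null runs on permuted responses):

# toy illustration: 200 observations, 10 fake ROI features
set.seed(1)
x_toy <- matrix(rnorm(200 * 10), nrow = 200,
                dimnames = list(NULL, paste0("roi_", 1:10)))
y_toy <- 0.5 * x_toy[, 1] + rnorm(200)
toy_fit <- eNetXplorer(x = x_toy, y = y_toy, family = "gaussian",
                       alpha = seq(0, 1, by = 0.2),
                       n_run = 20, n_perm_null = 10, seed = 123)
summary(toy_fit)  # accuracy (Pearson r) and permutation p-value per alpha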

enet_CFA_xplorer <- enet_CFA
names(enet_CFA_xplorer) <- modality_vec
## leave a few cores free so the machine stays responsive
no_cores <- availableCores() - 4
plan(multisession,workers= no_cores)

enet_CFA_fit <- modality_vec  %>%
  map(.,~fitting_enetxplorer(resp_vec = CFA_Resp_Var,
                             modality_var = .,
                             data_input = enet_CFA_data,
                             tuning_input = enet_CFA_xplorer))

names(enet_CFA_fit)<- modality_vec

3.2.6 results of eNetXplorer for each factor score

modality_vec %>% map(.,~enet_fit_table(modality_input=.,
                                       fit_input = enet_CFA_fit, 
                                       tune_input = enet_CFA_xplorer,
                                       resp_input=CFA_Resp_Var, 
                                       plotting_names = CFA_var_plotting))
[[1]]
eNetXplorer fitting of Nback
  Alpha   Best-tune lambda   Accuracy (Pearson r)   P-value   Response
  0                 1.1486                 0.4058    0.0004   Language
  0                 2.0159                 0.2895    0.0004   Mental Flexibility
  0                 2.1021                 0.3251    0.0004   Memory Recall
  0                 1.2884                 0.4097    0.0004   G-Factor

[[2]]
eNetXplorer fitting of SST
  Alpha   Best-tune lambda   Accuracy (Pearson r)   P-value   Response
  1.0               0.0076                 0.1731    0.0004   Language
  1.0               0.0211                 0.1447    0.0004   Mental Flexibility
  0.1               0.0087                 0.1218    0.0012   Memory Recall
  1.0               0.0071                 0.1784    0.0004   G-Factor

[[3]]
eNetXplorer fitting of MID
  Alpha   Best-tune lambda   Accuracy (Pearson r)   P-value   Response
  1                 0.0284                 0.2756    0.0004   Language
  0                 2.2486                 0.1614    0.0004   Mental Flexibility
  0                 1.5746                 0.2128    0.0004   Memory Recall
  1                 0.0236                 0.2649    0.0004   G-Factor

[[4]]
eNetXplorer fitting of rsmri
  Alpha   Best-tune lambda   Accuracy (Pearson r)   P-value   Response
  0                 0.3352                 0.2410    0.0004   Language
  0                 1.2950                 0.1434    0.0004   Mental Flexibility
  0                 1.2108                 0.1681    0.0004   Memory Recall
  0                 0.4319                 0.2281    0.0004   G-Factor

[[5]]
eNetXplorer fitting of smri
  Alpha   Best-tune lambda   Accuracy (Pearson r)   P-value   Response
  0.1               0.0913                 0.2483    0.0004   Language
  0.0               2.6983                 0.1540    0.0004   Mental Flexibility
  0.0               2.0348                 0.1881    0.0004   Memory Recall
  0.1               0.1134                 0.2387    0.0004   G-Factor

[[6]]
eNetXplorer fitting of DTI
  Alpha   Best-tune lambda   Accuracy (Pearson r)   P-value   Response
  0                 0.0128                 0.2370    0.0004   Language
  1                 0.0002                 0.1776    0.0004   Mental Flexibility
  0                 0.0394                 0.1758    0.0004   Memory Recall
  0                 0.0134                 0.2292    0.0004   G-Factor

3.2.7 Plot results from eNetXplorer with task-fMRI predicting each factor score

map2(.x=modality_type_vec,
     .y=str_type_vec,
     ~enet_interval_plot(data_input=enet_CFA_fit,
                         tuning_input=enet_CFA_xplorer,
                         resp_input =CFA_Resp_Var,
                         modality_input = .x,
                         str_input = .y, 
                         plotting_names = CFA_var_plotting))
## [[1]]
## [[1]]$Lang_first_order

## 
## [[1]]$CogFlex_first_order

## 
## [[1]]$MemRe_first_order

## 
## [[1]]$g_second_order

## 
## 
## [[2]]
## [[2]]$Lang_first_order

## 
## [[2]]$CogFlex_first_order

## 
## [[2]]$MemRe_first_order

## 
## [[2]]$g_second_order

## 
## 
## [[3]]
## [[3]]$Lang_first_order

## 
## [[3]]$CogFlex_first_order

## 
## [[3]]$MemRe_first_order

## 
## [[3]]$g_second_order

3.2.8 Plot results from eNetXplorer with sMRI predicting each factor score

enet_interval_plot_smri(data_input=enet_CFA_fit,tuning_input=enet_CFA,resp_input = CFA_Resp_Var, plotting_names = CFA_var_plotting)
## $Lang_first_order

## 
## $CogFlex_first_order

## 
## $MemRe_first_order

## 
## $g_second_order

3.2.9 Plot results from eNetXplorer with rs-fMRI predicting each factor score

enet_interval_plot_rsmri(data_input=enet_CFA_fit,tuning_input=enet_CFA,resp_input = CFA_Resp_Var, plotting_names = CFA_var_plotting)
## $Lang_first_order

## 
## $CogFlex_first_order

## 
## $MemRe_first_order

## 
## $g_second_order

3.2.10 Plot results from eNetXplorer with DTI predicting each factor score

enet_interval_plot_DTI(data_input=enet_CFA_fit,tuning_input=enet_CFA,resp_input = CFA_Resp_Var, plotting_names = CFA_var_plotting)
## $Lang_first_order

## 
## $CogFlex_first_order

## 
## $MemRe_first_order

## 
## $g_second_order

3.2.11 Plot brain images from eNetXplorer for tasks and sMRI predicting the G-Factor

    library(ggseg)
    library(ggsegExtra)
## Loading required package: ggseg3d
## Registered S3 method overwritten by 'geomorph':
##   method    from   
##   print.pls parsnip
    library(ggsegDesterieux)

    
    modality_type_vec <- list("Nback","SST","MID","smri")
    # modality_word_vec <- list("NBack-fMRI", "SST-fMRI","MID-fMRI","sMRI")
    modality_type_word_vec <- list(c("Nback","NBack-fMRI"),
                                   c("SST","SST-fMRI"),
                                   c("MID","MID-fMRI"),
                                   c("smri","sMRI"))

    str_type_vec <- list("X2backvs0back_ROI_",
                         "anystopvscorrectgo_ROI_",
                         "antiRewVsNeu_ROI_",
                         c("ASEG_","Dest_Thick_")) 
    
    #CFA_Resp_vec <- c("Lang_first_order", "CogFlex_first_order", "MemRe_first_order", "g_second_order")
    
    brainPrepTibFunc <- function(enet_CFA_fit, 
                                 modality_type, 
                                 str_type, 
                                 CFA_Resp) { 
      
      if (modality_type[1] == "smri") {
        brainPlotTib <- extract_tibble(enet_CFA_fit[[modality_type[1]]][[CFA_Resp]], 
                                       alpha_index = paste0("a",
                                                            enet_CFA_xplorer[[modality_type[1]]][[CFA_Resp]][["param_tune"]][["mixture"]])) %>%
          filter(type == "Target") %>% 
          mutate(.,label = str_replace_all(variable, str_type[1], '')) %>%
          mutate(.,label = str_replace_all(label, str_type[2], '')) %>%
          mutate(.,label = str_replace_all(label, '\\.', '-')) %>%
          mutate(.,label = str_replace_all(label, 'Brain-Stem', 'brain-stem')) %>% 
          mutate(.,label = str_replace_all(label, 'Right-Cerebellum-Cortex', 'right-cerebellum-cortex'))
      } else {
      brainPlotTib <- extract_tibble(enet_CFA_fit[[modality_type[1]]][[CFA_Resp]], 
                     alpha_index = paste0("a",
                                          enet_CFA_xplorer[[modality_type[1]]][[CFA_Resp]][["param_tune"]][["mixture"]])) %>%
      filter(type == "Target") %>% 
      mutate(.,label = str_replace_all(variable, str_type, '')) %>%
      mutate(.,label = str_replace_all(label, '\\.', '-')) %>%
      mutate(.,label = str_replace_all(label, 'Brain-Stem', 'brain-stem')) %>% 
      mutate(.,label = str_replace_all(label, 'Right-Cerebellum-Cortex', 'right-cerebellum-cortex'))
      }
      
      coefs_ggsegDes_all <- 
        ggsegDesterieux::desterieux %>% as_tibble %>% 
        select(label) %>% 
        na.omit() %>% 
        left_join(select(brainPlotTib, 
                         label, 
                         wmean, 
                         pvalue,
                         variable), by = "label")
      
      coefs_ggsegAseg_all <- 
        ggseg::aseg$data %>% as_tibble %>% 
        select(label) %>% 
        na.omit() %>% 
        left_join(select(brainPlotTib, 
                         label, 
                         wmean, 
                         pvalue,
                         variable), by = "label") 
      
      coefs_ggsegMax <- max(max(coefs_ggsegDes_all$wmean, na.rm = TRUE),
                            max(coefs_ggsegAseg_all$wmean, na.rm = TRUE))
      coefs_ggsegMin <- min(min(coefs_ggsegDes_all$wmean, na.rm = TRUE),
                            min(coefs_ggsegAseg_all$wmean, na.rm = TRUE))
      
      plot_ggsegDes_all <- 
        ggsegDesterieux::desterieux %>% as_tibble %>% 
        right_join(coefs_ggsegDes_all, by ="label") %>%
        select(label,  wmean, pvalue) %>% filter(pvalue <.05) %>%
        ggseg(atlas = 'desterieux', 
              mapping = aes(fill = wmean),
              colour="black"
        ) + 
        theme_void() +
        scale_fill_gradient2(
          limits = c(coefs_ggsegMin, coefs_ggsegMax), 
          midpoint = 0, low = "blue", mid = "white",
          high = "red", space = "Lab", na.value="transparent" ) +
        theme(legend.position = "none")+
        labs(title =paste0(modality_type[2])) 
      
      plot_ggsegAseg_all <- 
      ggseg::aseg$data %>% as_tibble %>% 
        right_join(coefs_ggsegAseg_all, by ="label") %>%
        select(label,  wmean, pvalue) %>% filter(pvalue <.05) %>%
        ggseg(atlas="aseg", 
              mapping=aes(fill=wmean),
              view = "axial",
              colour="black") +
        theme_void() +
        scale_fill_gradient2(
              limits = c(coefs_ggsegMin, coefs_ggsegMax),
          midpoint = 0, low = "blue", mid = "white",
          high = "red", space = "Lab", na.value="transparent" ) +
        guides(fill = guide_colourbar(barwidth = 0.5, barheight = 3, title = NULL))
      
      plotPreptaskfMRI <- 
        list(brainPlotTib,
             coefs_ggsegDes_all, 
             coefs_ggsegAseg_all,
             coefs_ggsegMax,
             coefs_ggsegMin,
             plot_ggsegDes_all,
             plot_ggsegAseg_all) %>% 
        purrr::set_names("brainPlotTib","coefs_ggsegDes_all", "coefs_ggsegAseg_all",
                         "coefs_ggsegMax","coefs_ggsegMin","plot_ggsegDes_all", "plot_ggsegAseg_all")
      
      return(plotPreptaskfMRI)
    }
    
    plotPreptaskfMRI  <-
      modality_type_word_vec %>%
      purrr::set_names(purrr::map(.,1)) %>% 
      purrr::map2(.x = ., 
                 .y = str_type_vec,
         ~  brainPrepTibFunc(enet_CFA_fit = enet_CFA_fit,
                             modality_type = .x,
                           str_type = .y,
                           CFA_Resp = "g_second_order"))
## merging atlas and data by 'label'
## merging atlas and data by 'label'
## Warning: Some data not merged properly. Check for naming errors in data:
##   atlas type  hemi  side  region label                    wmean  pvalue
##   <chr> <chr> <chr> <chr> <chr>  <chr>                    <dbl>   <dbl>
## 1 <NA>  <NA>  <NA>  <NA>  <NA>   right-cerebellum-cortex 0.0301 0.00120
## merging atlas and data by 'label'
## merging atlas and data by 'label'
## merging atlas and data by 'label'
## merging atlas and data by 'label'
## merging atlas and data by 'label'
## merging atlas and data by 'label'
    ggpubr::ggarrange(plotlist=purrr::map(plotPreptaskfMRI,"plot_ggsegDes_all")
              ,nrow=4)

    # ggpubr::ggarrange(plotlist=purrr::map(plotPreptaskfMRI,"plot_ggsegAs_all")
    #                   ,nrow=4)
    #               
    modality_type_vec %>% map(.,~gridExtra::grid.arrange(plotPreptaskfMRI[[.]][["plot_ggsegDes_all"]],
                                                         plotPreptaskfMRI[[.]][["plot_ggsegAseg_all"]],
                                                         nrow = 1, ncol = 2, widths = c(4, 1.5)))

## [[1]]
## TableGrob (1 x 2) "arrange": 2 grobs
##   z     cells    name           grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 
## [[2]]
## TableGrob (1 x 2) "arrange": 2 grobs
##   z     cells    name           grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 
## [[3]]
## TableGrob (1 x 2) "arrange": 2 grobs
##   z     cells    name           grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 
## [[4]]
## TableGrob (1 x 2) "arrange": 2 grobs
##   z     cells    name           grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]

3.2.12 Plot brain images from eNetXplorer for DTI predicting the G-Factor

  library(ggsegJHU)
    
    HaglerToJhu <- read_csv(paste0(utilFold, "HaglerToJhu.csv"))
## Rows: 44 Columns: 5
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (5): hemi, region, side, label, Hagler
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
    DTITrib <- extract_tibble(enet_CFA_fit[["DTI"]][["g_second_order"]], 
                   alpha_index = paste0("a", enet_CFA_xplorer[["DTI"]][["g_second_order"]][["param_tune"]][["mixture"]])) %>% 
      filter(type == "Target") %>%  
      mutate(.,Hagler = str_replace_all(variable, "FA_", '')) 
   
    ggsegJHU::jhu %>% as_tibble %>% 
      left_join(HaglerToJhu, by = c("hemi","region","side","label")) %>% 
      left_join(DTITrib %>% select(-type), by = "Hagler") %>% 
      filter(pvalue <.05) %>%
      select( -geometry) %>%
      ggseg(atlas = 'jhu', 
            mapping = aes(fill = wmean),
            colour="black") + 
      theme_void() +
    theme(text = element_text(size=30)) + 
    labs(title = "DTI") +
      scale_fill_gradient2(
#        limits = c(coefs_ggsegMin, coefs_ggsegMax),
        midpoint = 0, low = "blue", mid = "white",
        high = "red", space = "Lab", na.value="transparent" ) +
      guides(fill = guide_colourbar(barwidth = 1.5, barheight = 9, title = NULL))
## merging atlas and data by 'atlas', 'type', 'hemi', 'region', 'side', 'label'


3.3 Combine the predicted values with Random Forest Opportunistic Stacking for factor scores

3.3.1 Random Forest Data preparation for factor scores

enet_rf_valid_CFA <- CFA_Resp_Var %>% 
  map(.,~rnd_for_data_prep(pred_select = "predicted",
                           data_list = enet_CFA, 
                           resp_chose = .,
                           resp_data = valid_CFA))

enet_rf_test_baseline_CFA <- CFA_Resp_Var %>% 
  map(.,~rnd_for_data_prep(pred_select="predicted_baseline", 
                           data_list = enet_CFA, 
                           resp_chose = .,
                           resp_data = baseline_CFA))

enet_rf_test_followup_CFA <- CFA_Resp_Var %>% 
  map(.,~rnd_for_data_prep(pred_select="predicted_followup", 
                           data_list = enet_CFA, 
                           resp_chose = .,
                           resp_data = followup_CFA))
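
rnd_for_data_prep is defined earlier in the document (not shown here). It implements the opportunistic-stacking coding that the random forest and the Shapley plots below rely on: each modality's predicted score enters twice (the .x/.y column pairs), with missing predictions replaced by an extreme low value in one copy and an extreme high value in the other, so the forest can split on missingness itself. A minimal sketch of that coding, assuming the -1000/1000 codes that appear later in this section (code_opportunistic is a hypothetical name):

# illustrative only: duplicate one modality's predictions and encode missingness
# as -1000 in one copy and +1000 in the other
code_opportunistic <- function(pred) {
  tibble(pred.x = tidyr::replace_na(pred, -1000),
         pred.y = tidyr::replace_na(pred, 1000))
}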


CFA_stack_valid <- CFA_Resp_Var %>% 
  map(.,~stack_datasize_extract(resp_input = .,data_input = enet_rf_valid_CFA))%>%
  do.call(rbind,.)%>%
  mutate(number_valid = datasplit_number)%>%
  subset(.,select=-c(datasplit_number))

CFA_stack_baseline <- CFA_Resp_Var %>% 
  map(.,~stack_datasize_extract(resp_input = .,data_input = enet_rf_test_baseline_CFA))%>%
  do.call(rbind,.)%>%
  mutate(baseline_test = datasplit_number)%>%
  subset(.,select=-c(datasplit_number))


CFA_stack_followup <- CFA_Resp_Var %>% 
  map(.,~stack_datasize_extract(resp_input = .,data_input = enet_rf_test_followup_CFA))%>%
  do.call(rbind,.)%>%
  mutate(followup_test = datasplit_number)%>%
  subset(.,select=-c(datasplit_number))

plyr::join_all(list(CFA_stack_valid, CFA_stack_baseline, CFA_stack_followup, CFA_var_plotting), 
               type = "left", by = "response") %>%
  select("short_name", "number_valid", ends_with("test")) %>%
  kableExtra::kbl(caption = "Sample size of all the data sets in stacked models") %>%
  kableExtra::kable_classic(full_width = F, html_font = "Cambria")
Sample size of all the data sets in stacked models
  short_name           number_valid   baseline_test   followup_test
  Language                     2740            5108            5119
  Mental Flexibility           2740            5108            5119
  Memory Recall                2740            5108            5119
  G-Factor                     2740            5108            5119

3.3.2 random forest model fit for factor scores

## leave a few cores free so the machine stays responsive
no_cores <- availableCores() - 4
plan(multisession,workers= no_cores)

rf_fit_CFA <- CFA_Resp_Var %>% 
  future_map(.,~random_forest_tuning(resp=.,
                                     valid_list = enet_rf_valid_CFA,
                                     baseline_test = enet_rf_test_baseline_CFA,
                                     followup_test = enet_rf_test_followup_CFA),
             .options = furrr_options(seed=TRUE),
             .progress = TRUE)

3.3.4 Plot conditional permutation variable importance of the random forest stacking model for each factor score

enet_rf_valid_CFA_permu <- enet_rf_valid_CFA %>% map(.,function(data_input=.){
 data_output<- data_input[["NoID"]] %>% 
   dplyr::rename("MID_large"="MID_pred.x",
                 "SST_large"="SST_pred.x",
                 "smri_large"="smri_pred.x",
                 "rsmri_large"="rsmri_pred.x",
                 "DTI_large"="DTI_pred.x",
                 "Nback_large"="Nback_pred.x",
                 "MID_small"="MID_pred.y",
                 "SST_small"="SST_pred.y",
                 "smri_small"="smri_pred.y",
                 "rsmri_small"="rsmri_pred.y",
                 "DTI_small"="DTI_pred.y",
                 "Nback_small"="Nback_pred.y")
 return(data_output)
  })

cf_fit_CFA <- CFA_Resp_Var %>% map(.,function(resp_var=.){
  cforest(as.formula(paste0(resp_var, "~ .")),
          data=enet_rf_valid_CFA_permu[[resp_var]],
          control= cforest_unbiased(mtry=rf_fit_CFA[[resp_var]][["param_tune"]]$mtry,
                                    ntree=1000,
                                    minsplit=rf_fit_CFA[[resp_var]][["param_tune"]]$min_n))
}) 

PI_permimp_CFA <-cf_fit_CFA %>% 
  map(.,~permimp(.,progressBar = FALSE)) 

CFA_Resp_Var %>% 
  map(.,~plot(PI_permimp_CFA[[.]], type = "box", 
              horizontal = TRUE,
              main=paste0("Conditional Permutation Importance ",
                          CFA_var_plotting$short_name[[which(CFA_var_plotting$response==.)]])))

## $Lang_first_order
## NULL
## 
## $CogFlex_first_order
## NULL
## 
## $MemRe_first_order
## NULL
## 
## $g_second_order
## NULL
CFA_Resp_Var %>% 
  map(.,~plot(PI_permimp_CFA[[.]], 
              type = "bar", 
              interval="quantile",
              main=paste0("Conditional Permutation Importance ",
                          CFA_var_plotting$short_name[[which(CFA_var_plotting$response==.)]])))

## $Lang_first_order
## NULL
## 
## $CogFlex_first_order
## NULL
## 
## $MemRe_first_order
## NULL
## 
## $g_second_order
## NULL

3.3.5 Compute Shapley values for the G-Factor

3.3.5.1 Shapley computation function

library("fastshap")
## 
## Attaching package: 'fastshap'
## The following object is masked from 'package:vip':
## 
##     gen_friedman
## The following object is masked from 'package:dplyr':
## 
##     explain
## prediction wrapper for fastshap: must return a plain numeric vector;
## written for tidymodels fits whose predict() returns a tibble with a .pred column
## (other model types may need a different wrapper)
model_pred_fun <- function(object, newdata) {
  pred_results <- predict(object, new_data = newdata)
  return(pred_results$.pred)
}


model_shapley <- function(fit_input,data_input,resp_input){
   
library(doFuture)
registerDoFuture()
plan(multisession(workers = 25))
## because of the false alarm in plyr running this chunk would give
doRNG::registerDoRNG()  

 model_shap <- fit_input %>% 
   fastshap::explain(X = data_input%>%
                       select(-resp_input)
                     %>% as.data.frame(),nsim= 1000, pred_wrapper =model_pred_fun
                     ,.parallel=TRUE
                     )
  
return(model_shap)
}

3.3.5.2 extract the model fit and training data from the previous code

gfactor_random_forest_train_data <- enet_rf_valid_CFA$g_second_order$NoID
  
gfactor_random_forest_train_ID <- enet_rf_valid_CFA$g_second_order$ID$SUBJECTKEY

gfactor_random_forest_fit <- rf_fit_CFA$g_second_order$model_fit

3.3.5.3 compute Shapley values

shapley_gractor <- model_shapley(fit_input= gfactor_random_forest_fit, 
                                 data_input = gfactor_random_forest_train_data,
                                 resp_input = CFA_Resp_Var[4])

3.3.5.4 Shapley summary plot function

# The input of this function must have exactly 4 variables:
#   SUBJECTKEY      subject ID
#   modality_names  modality (MID, sMRI, rs-fMRI, NBack, SST, DTI)
#   shapley_values  Shapley values
#   feature_values  feature values
shapley_summary_plot <- function(data_input, list_val_input){
shap_plot <-   data_input%>%
 # filter(var_names %in% vars_keep)%>%
  mutate(modality_names = fct_reorder(modality_names, shapley_values,.fun = "max"))%>%
ggplot(aes(x = modality_names, y = shapley_values, color = feature_values)) +
    coord_flip(ylim = range(data_input$shapley_values)*1.1) +
    geom_hline(yintercept = 0) + # reference line at SHAP = 0
    # sina plot:
    ggforce::geom_sina(method = "counts", maxwidth = 0.7, alpha = 0.7)+
    scale_color_gradient(low="blue", high="red",
                         breaks=unlist(list_val_input), labels=c('min','med','max'),
guide = guide_colorbar(direction = "horizontal",barwidth = 12, barheight = 0.3))+
    theme_bw() +
    theme(axis.line.y = element_blank(),
          axis.ticks.y = element_blank(), # remove axis line
          legend.position="bottom",
          legend.direction = 'horizontal',
          legend.title=element_text(size=20),
          legend.text=element_text(size=20),
          legend.box = "horizontal",
          axis.title.x= element_text(size = 20),
          axis.text.y = element_text(size = 20),
          axis.text.x = element_text(size = 20)) +
    labs(y = "SHAP value (impact on model output)", x = "", color = "Feature value  ")
return(shap_plot)
}





shapley_plot_list <- function(shapley_input, data_input, ID_input){
  ##processing shapley output
  shapley_x_wider <- shapley_input%>%tibble::as_tibble()%>%
    dplyr::select(ends_with(".x"))%>%
    rename(MID = MID_pred.x,rs_fMRI= rsmri_pred.x, SST= SST_pred.x, 
           sMRI= smri_pred.x, NBack = Nback_pred.x, DTI = DTI_pred.x)%>%
    mutate(SUBJECTKEY=ID_input)
 
  
  shapley_x<-  shapley_x_wider%>%
    pivot_longer(-SUBJECTKEY, names_to = "modality_names", values_to = "shapley_values")
  
  
  data_x <- data_input%>%tibble::as_tibble()%>%
    dplyr::select(ends_with(".x"))%>%
    rename(MID = MID_pred.x,rs_fMRI= rsmri_pred.x, SST= SST_pred.x, 
           sMRI= smri_pred.x, NBack = Nback_pred.x, DTI = DTI_pred.x)%>%
    mutate(SUBJECTKEY=ID_input)%>%
    pivot_longer(-SUBJECTKEY, names_to = "modality_names", values_to = "feature_values")
  
  
  shapley_y_wider <- shapley_input%>%tibble::as_tibble()%>%
    dplyr::select(ends_with(".y"))%>%
    rename(MID = MID_pred.y,rs_fMRI= rsmri_pred.y, SST= SST_pred.y, 
           sMRI= smri_pred.y, NBack = Nback_pred.y, DTI = DTI_pred.y)%>%
    mutate(SUBJECTKEY=ID_input)
  shapley_y <-  shapley_y_wider%>%
    pivot_longer(-SUBJECTKEY, names_to = "modality_names", values_to = "shapley_values")
  
  
  data_y <- data_input%>%tibble::as_tibble()%>%
    dplyr::select(ends_with(".y"))%>%
    rename(MID = MID_pred.y,rs_fMRI= rsmri_pred.y, SST= SST_pred.y, 
           sMRI= smri_pred.y, NBack = Nback_pred.y, DTI = DTI_pred.y)%>%
    mutate(SUBJECTKEY=ID_input)%>%
    pivot_longer(-SUBJECTKEY, names_to = "modality_names", values_to = "feature_values")
  
  shapley_all <- bind_rows(shapley_x,shapley_y)
  data_all <- bind_rows(data_x,data_y)
  shapley_summary_tibble <- plyr::join_all(list(shapley_all,data_all), 
                               by=c("SUBJECTKEY","modality_names"),type="full")
  
  

    ##shapley summary plot
    shapley_summary_tibble_NA <- shapley_summary_tibble %>%
      naniar::replace_with_na(replace = list(feature_values = c(-1000, 1000)))
   shapley_summary_tibble_0 <- shapley_summary_tibble
      shapley_summary_tibble_0$feature_values[which(shapley_summary_tibble_0$feature_values == 1000)] <- 0
      shapley_summary_tibble_0$feature_values[which(shapley_summary_tibble_0$feature_values == -1000)] <- 0

    
   # list of functions to calculate the values where you want your breaks
    myfuns <- list(min, median, max)
    # use this list to make a list of your breaks
    list_val <- lapply(myfuns, function(f) f(shapley_summary_tibble$feature_values)) 
    list_val_0 <- lapply(myfuns, function(f) f(shapley_summary_tibble_0$feature_values))
    
  shapley_summary_plot_1k <- shapley_summary_plot(data_input = shapley_summary_tibble,
                                               list_val_input = list_val)  
  
    shapley_summary_plot_na <- shapley_summary_plot(data_input = shapley_summary_tibble_NA,
                                                    list_val_input = list_val_0)  

  
shapley_vi <- bind_rows(shapley_x_wider,shapley_y_wider)%>%
              select(-"SUBJECTKEY") %>%
              abs()%>% colSums()
              
shapley_vi_tibble <- tibble(modality_names = names(shapley_vi),
                            shapley_vi = shapley_vi)  

shapley_vi_plot <- shapley_vi_tibble %>%  mutate(modality_names = fct_reorder(modality_names, shapley_vi,.fun = "max"))%>%
  ggplot(aes(x = modality_names, y = shapley_vi))+
                      geom_bar(stat = "identity")+
                      coord_flip()+
  labs(y = "|SHAP| variable importance", x = "") +
    theme(axis.title.x= element_text(size = 20),
          axis.title.y= element_text(size = 20),
          axis.text.y = element_text(size = 20),
          axis.text.x = element_text(size = 20))
                      

return(list(summary_plot_na = shapley_summary_plot_na,
            summary_plot_1k = shapley_summary_plot_1k, 
            importance_plot = shapley_vi_plot))
}
shapley_plot_list(shapley_input = shapley_gfactor,
                  data_input = gfactor_random_forest_train_data,
                  ID_input = gfactor_random_forest_train_ID)
## Registered S3 method overwritten by 'ggforce':
##   method           from 
##   scale_type.units units
## $summary_plot_na

## 
## $summary_plot_1k

## 
## $importance_plot

3.3.6 Plot observed against predicted values for the random forest stacking model for each factor score

random_forest_plotting_followup(rf_fit =rf_fit_CFA,
                                followup_data=enet_rf_test_followup_CFA, 
                                naming_frame = CFA_var_plotting, 
                                resp_vec= CFA_Resp_Var)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

random_forest_plotting_baseline(rf_fit =rf_fit_CFA,
                                baseline_data=enet_rf_test_baseline_CFA, 
                                naming_frame =  CFA_var_plotting, 
                                resp_vec= CFA_Resp_Var)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

3.3.7 Plot observed against predicted values for the g-factor across modalities at baseline

modality_ploting <- c("Stacked_All","Stacked_Complete","Stacked_Best", "Stacked_NoBest", "NBack","MID", "SST","rs_fMRI","sMRI","DTI")
resp_plotting <- c("g_second_order")

predictedObserved_GfactBaseline <-
  baseline_CFA %>% select(SUBJECTKEY, g_second_order) %>% 
  # enet_rf_test_baseline_CFA$g_second_order$ID %>% #observed:
  # select(SUBJECTKEY, g_second_order) %>% 
  full_join(rf_fit_CFA$g_second_order$predicted_baseline, by = "SUBJECTKEY") %>%
  rename("Stacked_All"= pred_baseline) %>%
  full_join(enet_CFA$Nback_results$g_second_order$predicted_baseline, by = "SUBJECTKEY") %>%
  rename("NBack" = pred) %>%
  full_join(enet_CFA$SST_results$g_second_order$predicted_baseline, by = "SUBJECTKEY") %>%
  rename("SST" = pred) %>%
  full_join(enet_CFA$MID_results$g_second_order$predicted_baseline, by = "SUBJECTKEY") %>%
  rename("MID" = pred) %>%
  full_join(enet_CFA$rsmri_results$g_second_order$predicted_baseline, by = "SUBJECTKEY") %>%
  rename("rs_fMRI" = pred) %>%
  full_join(enet_CFA$smri_results$g_second_order$predicted_baseline, by = "SUBJECTKEY") %>%
  rename("sMRI" = pred) %>%
  full_join(enet_CFA$DTI_results$g_second_order$predicted_baseline, by = "SUBJECTKEY") %>%
  rename("DTI" = pred) 

# create a column of stacked predictions restricted to complete cases (listwise deletion)
predictedObserved_GfactBaselineZ <- predictedObserved_GfactBaseline %>% 
  na.omit() %>% 
  select(SUBJECTKEY, "Stacked_All") %>%
  rename("Stacked_Complete" = "Stacked_All") %>% 
  right_join(predictedObserved_GfactBaseline, by = "SUBJECTKEY") 
#%>%
# mutate_at(resp_plotting, ~(scale(.) %>% as.vector))

# create 1) Stacked_Best = stacked predictions for subjects who also have NBack
# and 2) Stacked_NoBest = stacked predictions for subjects without NBack
predictedObserved_GfactBaselineZ <-  predictedObserved_GfactBaselineZ %>% 
  select(SUBJECTKEY, "Stacked_All", "NBack") %>%
  drop_na("NBack") %>%
  rename("Stacked_Best" = "Stacked_All") %>% select(-NBack) %>% 
  right_join(predictedObserved_GfactBaselineZ, by = "SUBJECTKEY") %>% 
  mutate(Stacked_NoBest = Stacked_All)

predictedObserved_GfactBaselineZ[which(!is.na(predictedObserved_GfactBaselineZ$NBack)), 
                                       "Stacked_NoBest"] <- NA

predictedObserved_GfactBaselineZ <- 
  predictedObserved_GfactBaselineZ[, c(1, 4, 5, 3, 2, 12, 6, 7, 8, 9, 10, 11)]

plotGFact_baseline <- list()
for (modalityNum in seq(1,length(modality_ploting),1)) 
     {
      plotGFact_baseline[[modalityNum]]  <-
        ggplot(data = predictedObserved_GfactBaselineZ,
               aes(x = scale(.data[[modality_ploting[modalityNum]]]), 
                          y = scale(.data[['g_second_order']]))) +
          geom_jitter(height = 0.1, width = 0.1, size = 1, col = 'grey60') +
          geom_smooth(method = 'lm', se = FALSE, col = 'black')  +
          labs(x = NULL,
               y = NULL,
               title = paste0(modality_ploting[modalityNum],
               ' r=',round(cor(predictedObserved_GfactBaselineZ[modality_ploting[modalityNum]],
                                  predictedObserved_GfactBaselineZ$g_second_order,
                                  use = "pairwise.complete.obs"), 3))) +
               # "\nMAE=", round(
               #      yardstick::mae_vec(truth = as.numeric(unlist(predictedObserved_GfactBaselineZ[modality_ploting[modalityNum]])),
               #                    predictedObserved_GfactBaselineZ$g_second_order,
               #                    na_rm = TRUE),3)))  +
        theme(text = element_text(size=16))
    }
figure <- 
  ggpubr::ggarrange(ggpubr::ggarrange(plotlist=plotGFact_baseline[1:4]),
                  ggpubr::ggarrange(plotlist=plotGFact_baseline[5:10]),
                  nrow = 2)

  ggpubr::annotate_figure(figure,
                top = ggpubr::text_grob("Ability to Predict G-Factor at Baseline", face = "bold", size = 29),
                bottom = ggpubr::text_grob("Predicted G-Factor (Z)", size = 30),
                left = ggpubr::text_grob("Observed G-Factor (Z)", size = 30, rot = 90)
                )

3.3.8 number of (non-)missing values at baseline

naniar::vis_miss(predictedObserved_GfactBaselineZ %>% select(-SUBJECTKEY,-g_second_order)) + 
  guides(fill = ggplot2::guide_legend(reverse = TRUE)) +
  theme(
    axis.text.x=element_text(angle =- 90,hjust = 1),
        text = element_text(size=18)) 
## Warning: `gather_()` was deprecated in tidyr 1.2.0.
## Please use `gather()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.

colSums(!is.na(predictedObserved_GfactBaselineZ))
##       SUBJECTKEY   g_second_order      Stacked_All Stacked_Complete 
##             5487             5487             5171             1164 
##     Stacked_Best   Stacked_NoBest            NBack              SST 
##             2653             2518             2653             2791 
##              MID          rs_fMRI             sMRI              DTI 
##             3138             3839             4567             4071

3.3.9 Plot observed against predicted values for the g-factor across modalities at the follow-up

predictedObserved_Gfactfollowup <-
  followup_CFA %>% select(SUBJECTKEY, g_second_order) %>% 
  # enet_rf_test_baseline_CFA$g_second_order$ID %>% #observed:
  # select(SUBJECTKEY, g_second_order) %>% 
  full_join(rf_fit_CFA$g_second_order$predicted_followup, by = "SUBJECTKEY") %>%
  rename("Stacked_All"= pred_followup) %>%
  full_join(enet_CFA$Nback_results$g_second_order$predicted_followup, by = "SUBJECTKEY") %>%
  rename("NBack" = pred) %>%
  full_join(enet_CFA$SST_results$g_second_order$predicted_followup, by = "SUBJECTKEY") %>%
  rename("SST" = pred) %>%
  full_join(enet_CFA$MID_results$g_second_order$predicted_followup, by = "SUBJECTKEY") %>%
  rename("MID" = pred) %>%
  full_join(enet_CFA$rsmri_results$g_second_order$predicted_followup, by = "SUBJECTKEY") %>%
  rename("rs_fMRI" = pred) %>%
  full_join(enet_CFA$smri_results$g_second_order$predicted_followup, by = "SUBJECTKEY") %>%
  rename("sMRI" = pred) %>%
  full_join(enet_CFA$DTI_results$g_second_order$predicted_followup, by = "SUBJECTKEY") %>%
  rename("DTI" = pred) 

# create a column of stacked predictions restricted to complete cases (listwise deletion)
predictedObserved_GfactfollowupZ <- predictedObserved_Gfactfollowup %>% 
  na.omit() %>% 
  select(SUBJECTKEY, "Stacked_All") %>%
  rename("Stacked_Complete" = "Stacked_All") %>% 
  right_join(predictedObserved_Gfactfollowup, by = "SUBJECTKEY") 
#%>%
 # mutate_at(resp_plotting, ~(scale(.) %>% as.vector))

# create 1) Stacked_Best = stacked predictions for subjects who also have NBack
# and 2) Stacked_NoBest = stacked predictions for subjects without NBack
predictedObserved_GfactfollowupZ <- predictedObserved_GfactfollowupZ %>% 
  select(SUBJECTKEY, "Stacked_All", "NBack") %>%
  drop_na("NBack") %>%
  rename("Stacked_Best" = "Stacked_All") %>% select(-NBack) %>% 
  right_join(predictedObserved_GfactfollowupZ, by = "SUBJECTKEY") %>% 
  mutate(Stacked_NoBest = Stacked_All)

predictedObserved_GfactfollowupZ[which(!is.na(predictedObserved_GfactfollowupZ$NBack)), 
                                       "Stacked_NoBest"] <- NA

predictedObserved_GfactfollowupZ <- 
  predictedObserved_GfactfollowupZ[, c(1, 4, 5, 3, 2, 12, 6, 7, 8, 9, 10, 11)]

plotGFact_followup <- list()
for (modalityNum in seq(1,length(modality_ploting),1)) 
     {
      plotGFact_followup[[modalityNum]]  <-
        ggplot(data = predictedObserved_GfactfollowupZ,
               aes(x = scale(.data[[modality_ploting[modalityNum]]]) , 
                          y =scale(.data[['g_second_order']]) )) +
          geom_jitter(height = 0.1, width = 0.1, size = 1, col = 'grey60') +
          geom_smooth(method = 'lm', se = FALSE, col = 'black')  +
          labs(x = NULL,
               y = NULL,
               title = paste0(modality_ploting[modalityNum],
                ' r=',round(cor(predictedObserved_GfactfollowupZ[modality_ploting[modalityNum]],
                                   predictedObserved_GfactfollowupZ$g_second_order,
                                   use = "pairwise.complete.obs"), 3)))+
               # "\nMAE=", round(
               #      yardstick::mae_vec(truth = as.numeric(unlist(predictedObserved_GfactfollowupZ[modality_ploting[modalityNum]])),
               #                    predictedObserved_GfactfollowupZ$g_second_order,
               #                    na_rm = TRUE),3)))  +
        theme(text = element_text(size=16))
    }
figure <- 
  ggpubr::ggarrange(ggpubr::ggarrange(plotlist=plotGFact_followup[1:4]),
                  ggpubr::ggarrange(plotlist=plotGFact_followup[5:10]),
                  nrow = 2)
  ggpubr::annotate_figure(figure,
                top = ggpubr::text_grob("Ability to Predict G-Factor at Follow-up", face = "bold", size = 29),
                bottom = ggpubr::text_grob("Predicted G-Factor (Z)", size = 30),
                left = ggpubr::text_grob("Observed G-Factor (Z)", size = 30, rot = 90)
                )

3.3.10 number of (non-)missing values

naniar::vis_miss(predictedObserved_GfactfollowupZ %>% select(-SUBJECTKEY,-g_second_order)) + 
  guides(fill = ggplot2::guide_legend(reverse = TRUE)) +
  theme(
    axis.text.x=element_text(angle =- 90,hjust = 1),
        text = element_text(size=20)) 

colSums(!is.na(predictedObserved_GfactfollowupZ))
##       SUBJECTKEY   g_second_order      Stacked_All Stacked_Complete 
##             5280             5280             5119             1374 
##     Stacked_Best   Stacked_NoBest            NBack              SST 
##             2996             2123             2996             2903 
##              MID          rs_fMRI             sMRI              DTI 
##             3127             4137             4569             4230

3.4 differences in g-factor between missing vs. non-missing data

3.4.1 create data frames for missing values

  predictedObserved_GfactBaselineZ_missing <-
  predictedObserved_GfactBaselineZ %>%
    mutate(Stacked_All_missing = is.na(Stacked_All)) %>%
    mutate(Stacked_Complete_missing = is.na(Stacked_Complete)) %>%
    mutate(Stacked_Best_missing = is.na(Stacked_Best)) %>%
    mutate(Stacked_NoBest_missing = is.na(Stacked_NoBest)) %>% 
    mutate(NBack_missing = is.na(NBack)) %>%
    mutate(SST_missing = is.na(SST)) %>%
    mutate(MID_missing = is.na(MID)) %>%
    mutate(rs_fMRI_missing = is.na(rs_fMRI)) %>%
    mutate(sMRI_missing = is.na(sMRI)) %>%
    mutate(DTI_missing = is.na(DTI)) 

  predictedObserved_GfactfollowupZ_missing <-
  predictedObserved_GfactfollowupZ %>%
    mutate(Stacked_All_missing = is.na(Stacked_All)) %>%
    mutate(Stacked_Complete_missing = is.na(Stacked_Complete)) %>%
    mutate(Stacked_Best_missing = is.na(Stacked_Best)) %>%
    mutate(Stacked_NoBest_missing = is.na(Stacked_NoBest)) %>% 
    mutate(NBack_missing = is.na(NBack)) %>%
    mutate(SST_missing = is.na(SST)) %>%
    mutate(MID_missing = is.na(MID)) %>%
    mutate(rs_fMRI_missing = is.na(rs_fMRI)) %>%
    mutate(sMRI_missing = is.na(sMRI)) %>%
    mutate(DTI_missing = is.na(DTI)) 

3.4.2 function for plotting the differences with Cohen's d

ttest_missing_plot <- function(missing_colume_name, missing_title, missing_response, input_df){

  ttest_formula <- as.formula(paste(missing_response, paste(missing_colume_name, collapse=" + "), sep=" ~ "))
  
  ttest_missing_gg <- input_df %>%
    ggplot(aes_string(y=missing_response, x= missing_colume_name))+
    geom_violin(trim = FALSE) + 
    stat_summary(
      fun.data = "mean_sdl",  fun.args = list(mult = 2), 
      geom = "pointrange", color = "black"
      ) + 
    scale_x_discrete(labels=c("Present", "Missing")) + 
    ggpubr::stat_pvalue_manual(
      input_df %>%
        rstatix::t_test(ttest_formula) %>%
        rstatix::add_significance() %>% 
        rstatix::add_xy_position(x = missing_colume_name)) + 
    ggtitle(paste0(missing_title, "\nCohen d = ", 
          rstatix::cohens_d(data = input_df, 
                  formula = ttest_formula, 
                  var.equal = FALSE)%>% .$effsize %>% round(digits =3)))+
    theme(axis.title.x=element_blank()) + 
    theme(axis.title.y=element_blank())  
  
  return(ttest_missing_gg)
}
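For example, a single illustrative call (the full set of plots is generated with pmap() below):

# ttest_missing_plot(missing_colume_name = "NBack_missing",
#                    missing_title = "NBack",
#                    missing_response = "g_second_order",
#                    input_df = predictedObserved_GfactBaselineZ_missing)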

3.4.3 plot the differences with Cohen's d for baseline and follow-up

missing_vars <- predictedObserved_GfactBaselineZ_missing %>% 
  select(ends_with("missing")) %>% 
  colnames() %>% 
  as.list() 

missing_names <- predictedObserved_GfactBaselineZ_missing %>% 
  select(ends_with("missing")) %>% 
  colnames() %>% 
  str_replace("_missing", "") %>%
  as.list() 

missing_response <- replicate(length(missing_names), "g_second_order") %>% as.list() 

missing_input_baseline_df <- replicate(length(missing_names), predictedObserved_GfactBaselineZ_missing, simplify = FALSE)
missing_input_followUp_df <- replicate(length(missing_names), predictedObserved_GfactfollowupZ_missing, simplify = FALSE)

ttest_missing_plot_baseline_list <- purrr::pmap(list(missing_colume_name = missing_vars,
                 missing_title = missing_names,
                 missing_response = missing_response,
                 input_df = missing_input_baseline_df),            
            ttest_missing_plot)

title_baseline.grob <- textGrob("Baseline Test", 
                   gp=gpar(fontsize=18))

ttest_missing_plot_baseline_gg <- 
  cowplot::plot_grid(plotlist=ttest_missing_plot_baseline_list,
                  nrow =5, align='vh', vjust=1, scale = 1) 

ttest_missing_plot_baseline_gg_title <- gridExtra::grid.arrange(gridExtra::arrangeGrob(ttest_missing_plot_baseline_gg, 
                                                  top = title_baseline.grob))

ttest_missing_plot_followup_list <- purrr::pmap(list(missing_colume_name = missing_vars,
                 missing_title = missing_names,
                 missing_response = missing_response,
                 input_df = missing_input_followUp_df),            
            ttest_missing_plot)

title_followup.grob <- textGrob("Follow Up Test", 
                   gp=gpar(fontsize=18))

ttest_missing_plot_followup_gg <- 
  cowplot::plot_grid(plotlist=ttest_missing_plot_followup_list,
                  nrow =5, align='vh', vjust=1, scale = 1) 

ttest_missing_plot_followup_gg_title <- gridExtra::grid.arrange(gridExtra::arrangeGrob(ttest_missing_plot_followup_gg, 
                                                  top = title_followup.grob))

3.4.4 combine the plots for the differences with Cohen's d for baseline and follow-up

ttest_missing_plot_both_gg <- cowplot::plot_grid(ttest_missing_plot_baseline_gg_title, 
                   ttest_missing_plot_followup_gg_title,
                  nrow =1)

title_overall.grob <- textGrob(expression(paste("Differences in ",italic("g"), "-Factor(Z) As a Function of Missing Data")), 
                   gp=gpar(fontsize=20))

y_overall.grob <- textGrob(expression(paste(italic("g"), "-factor(Z)")), 
                   gp=gpar(fontsize=18), rot = 90)

ttest_missing_plot_both_gg_title <- gridExtra::grid.arrange(gridExtra::arrangeGrob(ttest_missing_plot_both_gg, 
                                                  top = title_overall.grob,
                                                  left = y_overall.grob))

3.4.5 Creating tables for predictive performance

get_summary_stats <- function(data_input, target_input){
 data_input <- data_input %>% 
    mutate(pred_target = as.numeric(scale(data_input$g_second_order)) ) 
 
    mae_perform <- yardstick::mae(data =data_input, 
                                truth=pred_target, estimate=data_input[[target_input]])
    rmse_perform <- yardstick::rmse(data =data_input, 
                                    truth=pred_target, estimate=data_input[[target_input]])
    cor_perform <- cor(data_input$g_second_order,
                 data_input[[target_input]],
                 use = "pairwise.complete.obs")
    tradrsq_perform <- yardstick::rsq_trad(data = data_input, 
                                         truth=pred_target, 
                                         estimate=data_input[[target_input]])
return( tibble(Models=target_input, 
               Pearson_r=round(cor_perform,3),
               R_squared=round(tradrsq_perform$.estimate,3),
               MAE=round(mae_perform$.estimate,3),
               RMSE=round(rmse_perform$.estimate,3) 
               ) )
}

elem_remove <- c("SUBJECTKEY","g_second_order")
baseline_pred_vec <- setdiff(colnames(predictedObserved_GfactBaselineZ),elem_remove) 

baseline_perform <- baseline_pred_vec %>% 
  map(.,~get_summary_stats(data_input = predictedObserved_GfactBaselineZ,
                           target_input = .))%>% 
  do.call(rbind,.)%>%
  kableExtra::kbl(caption = paste0("Predictive Performance for the baseline test set")) %>%
   kableExtra::kable_classic(full_width = F, html_font = "Cambria") %>%
   print()
Predictive Performance for the baseline test set

Models            Pearson_r  R_squared    MAE   RMSE
Stacked_All           0.439      0.191  0.699  0.895
Stacked_Complete      0.429      0.183  0.610  0.780
Stacked_Best          0.442      0.195  0.620  0.798
Stacked_NoBest        0.296      0.085  0.783  0.987
NBack                 0.402      0.072  0.664  0.857
SST                   0.129     -0.033  0.744  0.950
MID                   0.202      0.013  0.738  0.944
rs_fMRI               0.233      0.042  0.749  0.955
sMRI                  0.208      0.040  0.763  0.969
DTI                   0.190      0.033  0.757  0.972
followup_pred_vec <- setdiff(colnames(predictedObserved_GfactfollowupZ)
                             ,elem_remove) 

followup_perform <- followup_pred_vec %>% 
  map(.,~get_summary_stats(data_input = predictedObserved_GfactfollowupZ,
                           target_input = .))%>% 
  do.call(rbind,.)%>%
  kableExtra::kbl(caption = paste0("Predictive Performance for the followup test set")) %>%
   kableExtra::kable_classic(full_width = F, html_font = "Cambria") %>%
   print()
Predictive Performance for the followup test set

Models            Pearson_r  R_squared    MAE   RMSE
Stacked_All           0.414      0.166  0.719  0.913
Stacked_Complete      0.427      0.168  0.651  0.829
Stacked_Best          0.438      0.175  0.666  0.846
Stacked_NoBest        0.317      0.100  0.794  1.000
NBack                 0.383      0.118  0.687  0.875
SST                   0.145     -0.004  0.760  0.961
MID                   0.148     -0.003  0.757  0.955
rs_fMRI               0.251      0.055  0.754  0.954
sMRI                  0.226      0.049  0.764  0.965
DTI                   0.212      0.045  0.771  0.978

3.4.6 function to bootstrap the differences between Stacked Best and NBack

perfDiff = function(data, indices){
  sample = data[indices,]
  
  cor_Stack <- cor(sample$g_second_order,
                 sample$Stacked_Best,
                 use = "pairwise.complete.obs")
  
  cor_NBack <- cor(sample$g_second_order,
                 sample$NBack,
                 use = "pairwise.complete.obs")
  
  tradrsq_Stack <- yardstick::rsq_trad(data =sample, 
                              truth=g_second_order, 
                              estimate=Stacked_Best)

  tradrsq_NBack <- yardstick::rsq_trad(data =sample, 
                              truth=g_second_order, 
                              estimate=NBack)
  
  mae_Stack <- yardstick::mae(data =sample, 
                              truth=g_second_order, 
                              estimate=Stacked_Best)
  
  mae_NBack <- yardstick::mae(data =sample, 
                              truth=g_second_order, 
                              estimate=NBack)
  
  rmse_Stack <- yardstick::rmse(data =sample, 
                              truth=g_second_order, 
                              estimate=Stacked_Best)
  
  rmse_NBack <- yardstick::rmse(data =sample, 
                              truth=g_second_order, 
                              estimate=NBack)
 
  cor_StackVSNBack = cor_Stack - cor_NBack 
  tradrsq_StackVSNBack = tradrsq_Stack$.estimate - tradrsq_NBack$.estimate
  mae_StackVSNBack = mae_Stack$.estimate - mae_NBack$.estimate
  rmse_StackVSNBack = rmse_Stack$.estimate - rmse_NBack$.estimate
  
  StackVSNBack = c(cor_StackVSNBack, tradrsq_StackVSNBack, mae_StackVSNBack, rmse_StackVSNBack)
  
  return(StackVSNBack)
}

3.4.7 actual bootstrapping for the baseline

StackBestVSBaseline <- predictedObserved_GfactBaselineZ %>% 
  select(g_second_order,Stacked_Best,NBack) %>% na.omit()

set.seed(12345)
DiffResultsBaseline <- boot::boot(data = StackBestVSBaseline,
           statistic = perfDiff,
           R = 5000)
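The observed (non-bootstrapped) differences are stored in the t0 slot of the boot object, in the same order as perfDiff() returns them:

## observed Stacked Best minus NBack differences: r, R-squared, MAE, RMSE
# DiffResultsBaseline$t0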

#"Bias Corrected and Accelerated"
ci_r_baseline <- boot::boot.ci(DiffResultsBaseline, index = 1, conf = 0.95, type = 'bca')
ci_rsq_baseline <-boot::boot.ci(DiffResultsBaseline, index = 2, conf = 0.95, type = 'bca')
ci_mae_baseline <- boot::boot.ci(DiffResultsBaseline, index = 3, conf = 0.95, type = 'bca')
ci_rmse_baseline <- boot::boot.ci(DiffResultsBaseline, index = 4, conf = 0.95, type = 'bca')

ci_r_baseline 
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 5000 bootstrap replicates
## 
## CALL : 
## boot::boot.ci(boot.out = DiffResultsBaseline, conf = 0.95, type = "bca", 
##     index = 1)
## 
## Intervals : 
## Level       BCa          
## 95%   ( 0.0206,  0.0585 )  
## Calculations and Intervals on Original Scale
ci_rsq_baseline 
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 5000 bootstrap replicates
## 
## CALL : 
## boot::boot.ci(boot.out = DiffResultsBaseline, conf = 0.95, type = "bca", 
##     index = 2)
## 
## Intervals : 
## Level       BCa          
## 95%   ( 0.0727,  0.1393 )  
## Calculations and Intervals on Original Scale
ci_mae_baseline 
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 5000 bootstrap replicates
## 
## CALL : 
## boot::boot.ci(boot.out = DiffResultsBaseline, conf = 0.95, type = "bca", 
##     index = 3)
## 
## Intervals : 
## Level       BCa          
## 95%   (-0.0396, -0.0157 )  
## Calculations and Intervals on Original Scale
ci_rmse_baseline 
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 5000 bootstrap replicates
## 
## CALL : 
## boot::boot.ci(boot.out = DiffResultsBaseline, conf = 0.95, type = "bca", 
##     index = 4)
## 
## Intervals : 
## Level       BCa          
## 95%   (-0.0516, -0.0270 )  
## Calculations and Intervals on Original Scale
3.4.8 actual bootstrapping for the follow-up

StackBestVSfollowup <- predictedObserved_GfactfollowupZ %>% 
  select(g_second_order,Stacked_Best,NBack) %>% na.omit()

set.seed(12345)
DiffResultsfollowup <- boot::boot(data = StackBestVSfollowup,
                                  statistic = perfDiff,
                                  R = 5000)

#"Bias Corrected and Accelerated"
ci_r_followup <- boot::boot.ci(DiffResultsfollowup, index = 1, conf = 0.95, type = 'bca')
ci_rsq_followup <-boot::boot.ci(DiffResultsfollowup, index = 2, conf = 0.95, type = 'bca')
ci_mae_followup <- boot::boot.ci(DiffResultsfollowup, index = 3, conf = 0.95, type = 'bca')
ci_rmse_followup <- boot::boot.ci(DiffResultsfollowup, index = 4, conf = 0.95, type = 'bca')

ci_r_followup 
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 5000 bootstrap replicates
## 
## CALL : 
## boot::boot.ci(boot.out = DiffResultsfollowup, conf = 0.95, type = "bca", 
##     index = 1)
## 
## Intervals : 
## Level       BCa          
## 95%   ( 0.0359,  0.0732 )  
## Calculations and Intervals on Original Scale
ci_rsq_followup 
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 5000 bootstrap replicates
## 
## CALL : 
## boot::boot.ci(boot.out = DiffResultsfollowup, conf = 0.95, type = "bca", 
##     index = 2)
## 
## Intervals : 
## Level       BCa          
## 95%   ( 0.0126,  0.0745 )  
## Calculations and Intervals on Original Scale
ci_mae_followup 
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 5000 bootstrap replicates
## 
## CALL : 
## boot::boot.ci(boot.out = DiffResultsfollowup, conf = 0.95, type = "bca", 
##     index = 3)
## 
## Intervals : 
## Level       BCa          
## 95%   (-0.0211,  0.0020 )  
## Calculations and Intervals on Original Scale
ci_rmse_followup 
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 5000 bootstrap replicates
## 
## CALL : 
## boot::boot.ci(boot.out = DiffResultsfollowup, conf = 0.95, type = "bca", 
##     index = 4)
## 
## Intervals : 
## Level       BCa          
## 95%   (-0.0294, -0.0050 )  
## Calculations and Intervals on Original Scale

3.4.9 plot bootstrapped distribution of Stacked Best > N-Back in the Baseline Test Set

r_plot_baseline <- as_tibble(DiffResultsBaseline$t[,1]) %>%
  ggplot() + geom_density(aes(x = value)) + 
  ggtitle("Pearson's r") +
  geom_vline(aes(xintercept=ci_r_baseline$bca[4]), color="red", linetype="dashed") +
  geom_vline(aes(xintercept=ci_r_baseline$bca[5]), color="red", linetype="dashed") +
    theme(axis.title.x=element_blank(),
        axis.title.y=element_blank(),
        text = element_text(size=20))

rsq_plot_baseline <- as_tibble(DiffResultsBaseline$t[,2]) %>%
  ggplot() + geom_density(aes(x = value)) +
  ggtitle("R squared") +
  geom_vline(aes(xintercept=ci_rsq_baseline$bca[4]), color="red", linetype="dashed") +
  geom_vline(aes(xintercept=ci_rsq_baseline$bca[5]), color="red", linetype="dashed") +
    theme(axis.title.x=element_blank(),
        axis.title.y=element_blank(),
        text = element_text(size=20))

mae_plot_baseline <- as_tibble(DiffResultsBaseline$t[,3]) %>%
  ggplot() + geom_density(aes(x = value)) + 
  ggtitle("MAE") +
  geom_vline(aes(xintercept=ci_mae_baseline$bca[4]), color="red", linetype="dashed") +
  geom_vline(aes(xintercept=ci_mae_baseline$bca[5]), color="red", linetype="dashed") +
    theme(axis.title.x=element_blank(),
        axis.title.y=element_blank(),
        text = element_text(size=20))

rmse_plot_baseline <- as_tibble(DiffResultsBaseline$t[,4]) %>%
  ggplot() + geom_density(aes(x = value)) + 
  ggtitle("RMSE") +
  geom_vline(aes(xintercept=ci_rmse_baseline$bca[4]), color="red", linetype="dashed") +
  geom_vline(aes(xintercept=ci_rmse_baseline$bca[5]), color="red", linetype="dashed") +
    theme(axis.title.x=element_blank(),
        axis.title.y=element_blank(),
        text = element_text(size=20))

boot_baseline <- 
  ggpubr::ggarrange(plotlist= 
                    list(r_plot_baseline, 
                         rsq_plot_baseline, 
                         mae_plot_baseline, 
                         rmse_plot_baseline)) %>% 
  ggpubr::annotate_figure(top = ggpubr::text_grob("Baseline Test Set",
                                                     size=22,
                                                     face = "bold"))

3.4.10 plot bootstrapped distribution of Stacked Best > N-Back in the Follow-up Test Set

r_plot_followup <- as_tibble(DiffResultsfollowup$t[,1]) %>%
  ggplot() + geom_density(aes(x = value)) + 
  ggtitle("Pearson's r") +
  geom_vline(aes(xintercept=ci_r_followup$bca[4]), color="red", linetype="dashed") +
  geom_vline(aes(xintercept=ci_r_followup$bca[5]), color="red", linetype="dashed") +
  theme(axis.title.x=element_blank(),
        axis.title.y=element_blank(),
        text = element_text(size=20))

rsq_plot_followup <- as_tibble(DiffResultsfollowup$t[,2]) %>%
  ggplot() + geom_density(aes(x = value)) +
  ggtitle("R squared") +
  geom_vline(aes(xintercept=ci_rsq_followup$bca[4]), color="red", linetype="dashed") +
  geom_vline(aes(xintercept=ci_rsq_followup$bca[5]), color="red", linetype="dashed") +
  theme(axis.title.x=element_blank(),
        axis.title.y=element_blank(),
        text = element_text(size=20))

mae_plot_followup <- as_tibble(DiffResultsfollowup$t[,3]) %>%
  ggplot() + geom_density(aes(x = value)) + 
  ggtitle("MAE") +
  geom_vline(aes(xintercept=ci_mae_followup$bca[4]), color="red", linetype="dashed") +
  geom_vline(aes(xintercept=ci_mae_followup$bca[5]), color="red", linetype="dashed") +
  theme(axis.title.x=element_blank(),
        axis.title.y=element_blank(),
        text = element_text(size=20))

rmse_plot_followup <- as_tibble(DiffResultsfollowup$t[,4]) %>%
  ggplot() + geom_density(aes(x = value)) + 
  ggtitle("RMSE") +
  geom_vline(aes(xintercept=ci_rmse_followup$bca[4]), color="red", linetype="dashed") +
  geom_vline(aes(xintercept=ci_rmse_followup$bca[5]), color="red", linetype="dashed") +
  theme(axis.title.x=element_blank(),
        axis.title.y=element_blank(),
        text = element_text(size=20))

boot_followup <- ggpubr::ggarrange(plotlist= 
                                     list(r_plot_followup, rsq_plot_followup, mae_plot_followup, rmse_plot_followup)) %>% 
  ggpubr::annotate_figure(top = ggpubr::text_grob("Follow-up Test Set",
                                                     size=22,
                                                     face = "bold"))

3.4.11 combine bootstrapped distribution plots

ggpubr::ggarrange(boot_baseline, boot_followup,
                  nrow=2) %>%
  ggpubr::annotate_figure(top = 
                            ggpubr::text_grob("Bootstrapped Distribution Stacked Best > N-Back",
                                                     size=24,
                                                     face = "bold"))

4 Leave one site out cross validation

To test whether the models generalise across sites, we used the baseline data: we fit the models to all but one site and tested them on the held-out site.

4.1 set up the folds

After leaving one site out as the test set, we divided the remaining data 50/50 into a training set (i.e., 1st layer) and a validation set (i.e., 2nd layer). This process was then repeated until every site had served as the held-out site.

baseline_site <- baseline_analysis %>% drop_na(SITE_ID_L) 
unique_site <- length(unique(baseline_site$SITE_ID_L))

site_vfold <- group_vfold_cv(data = baseline_site, group = "SITE_ID_L", v= unique_site)
## the following two lines take the split results and turn them into data frames
## so that the results can be inspected.
#model_data_1 <- site_vfold$splits[[1]] %>% analysis()
#holdout_data_1 <- site_vfold$splits[[1]] %>% assessment()

all_sample <- site_vfold$id

all_site <-  all_sample %>% 
  map(.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L)) %>% 
  do.call(rbind,.)

all_site <- as.vector(all_site)
all_site_number <- str_remove_all(all_site,"site")

### divide the sampled data into train, validation and test sets
# after leaving one site out, the rest was divided in two: train and validate

 train_valid_split <- all_sample %>% 
   map(.,~vfold_cv(data=analysis(site_vfold$splits[[which(site_vfold$id==.)]]), 
                   v=2,repeats = 1))
 
names(train_valid_split)<- all_sample %>% 
  map(.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L))

test_site_split <- all_sample %>% 
  map(.,~assessment(site_vfold$splits[[which(site_vfold$id==.)]]))

names(test_site_split)<- all_sample %>% 
  map(.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L)) 

4.2 Fitting elastic net to the leave-one-site-out cross validation

Note that for cross-site validation we focused only on the baseline data; we did not use the follow-up data here.

4.2.1 set up the data and adjust for batch effects

enet_site_nocom <- 
  function(resp_var,
           split_train = analysis( data_two_fold$splits[[1]]),
           split_valid = assessment(data_two_fold$splits[[1]]),
           split_test=data_fold_test, 
           contrast_name){
    
              data_train <- split_train%>% 
                select(all_of(resp_var),
                       all_of(subj_info),
                       starts_with(contrast_name))%>%
                drop_na()%>% IQR_remove()
              
              data_valid <- split_valid%>% 
                select(all_of(resp_var),
                       all_of(subj_info),
                       starts_with(contrast_name))%>%
                drop_na()%>% IQR_remove()
              
              test_set <- split_test%>% 
                select(all_of(resp_var),
                       all_of(subj_info),
                       starts_with(contrast_name))%>%
                drop_na()%>% IQR_remove()
              
              ols_data_train <- 
                select(data_train,
                       resp_var, 
                       starts_with(contrast_name))
              
              ols_data_valid <- 
                select(data_valid,
                       resp_var,
                       starts_with(contrast_name))
              ols_test_set <- select(test_set,
                                     resp_var,
                                     starts_with(contrast_name))
              
    return(list(ols_data_train=ols_data_train,
              ols_data_valid=ols_data_valid,
              ols_test_set=ols_test_set,
              data_valid_id=data_valid$SUBJECTKEY,
              test_set_id=test_set$SUBJECTKEY,
              data_train_id=data_train$SUBJECTKEY))
                              }


enet_site_com <- function(resp_var,
                          split_train = analysis( data_two_fold$splits[[1]]),
                          split_valid = assessment(data_two_fold$splits[[1]]),
                          split_test=data_fold_test, 
                          measure_one,
                          measure_two){
  data_train <-split_train%>% 
    select(all_of(resp_var),
           all_of(subj_info),
           starts_with(measure_one),
           starts_with(measure_two))%>%
    drop_na()%>% IQR_remove()
 data_valid <- split_valid%>%
    select(all_of(resp_var),
           all_of(subj_info),
           starts_with(measure_one),
           starts_with(measure_two))%>%
   drop_na()%>% IQR_remove()
  test_set <- split_test%>% 
    select(all_of(resp_var),
           all_of(subj_info),
           starts_with(measure_one),
           starts_with(measure_two))%>%
    drop_na()%>% IQR_remove()
  ols_data_train <- batch_adjust(data_fold = data_train,
                                 resp = resp_var, 
                                 x=measure_one, 
                                 y=measure_two)
  ols_data_valid <- batch_adjust(data_fold = data_valid,
                                 resp = resp_var, 
                                 x=measure_one, 
                                 y= measure_two)
  ols_test_set <- select(test_set, 
                         resp_var,
                         starts_with(measure_one), 
                         starts_with(measure_two) ) 
  return(list(ols_data_train=ols_data_train,
              ols_data_valid=ols_data_valid,
              ols_test_set=ols_test_set,
              data_valid_id=data_valid$SUBJECTKEY,
              test_set_id=test_set$SUBJECTKEY,
              data_train_id=data_train$SUBJECTKEY))
}

# for models that require ComBat but have only one type of data; so far this was used for DTI only, because DTI variables share a single prefix ("FA_")
enet_site_comOne <- function(resp_var,
                             split_train = analysis( data_two_fold$splits[[1]]),
                              split_valid = assessment(data_two_fold$splits[[1]]),
                             split_test=data_fold_test, measure_one){
  
  data_train <-split_train%>% 
    select(all_of(resp_var),
           all_of(subj_info),
           starts_with(measure_one))%>%
    drop_na()%>% 
    IQR_remove()
  
 data_valid <- split_valid%>%
    select(all_of(resp_var),
           all_of(subj_info),
           starts_with(measure_one))%>%
   drop_na()%>% 
   IQR_remove()
 
  test_set <- split_test%>% 
    select(all_of(resp_var),
           all_of(subj_info),
           starts_with(measure_one))%>%
    drop_na()%>% IQR_remove()
  ols_data_train <- batch_adjustOne(data_fold = data_train,
                                 resp = resp_var, x=measure_one)
  ols_data_valid <- batch_adjustOne(data_fold = data_valid,
                                 resp = resp_var, x=measure_one)
  ols_test_set <- select(test_set, 
                         resp_var,
                         starts_with(measure_one)) 
  return(list(ols_data_train=ols_data_train,
              ols_data_valid=ols_data_valid,
              ols_test_set=ols_test_set,
              data_valid_id=data_valid$SUBJECTKEY,
              test_set_id=test_set$SUBJECTKEY,
              data_train_id=data_train$SUBJECTKEY))
}

4.2.2 functions to prep the data and the model before the actual modeling

enet_tuning_site <- 
  function(resp_var, data_list){
     # recipe: the ols_* data frames are used since train and test share the same variables
    norm_recipe <-recipe(as.formula(paste0(resp_var, "~ .")), 
                      data =data_list[["ols_data_train"]] ) %>%
                  step_dummy(all_nominal()) %>%
                  step_center(all_predictors()) %>%
                  step_scale(all_predictors())  %>% 
                  # estimate the means and standard deviations
                  prep(training = data_list[["ols_data_train"]], retain = TRUE)

## model
glmn_mod <- linear_reg(penalty = tune() ,mixture =tune()) %>% 
  set_engine("glmnet") 
set.seed(123456)
all_rs <- rsample::vfold_cv(data=data_list[["ols_data_train"]],v = 10,repeats = 1)
glmn_wfl <- workflow()%>%
  add_recipe(norm_recipe)%>%
  add_model(glmn_mod)
## penalty is lambda, mixture is alpha
## log10 transformed to get lambda between 10^-10 to 10^1
## this expands the default values (10^-10 to 10^0) from tidymodels https://github.com/tidymodels/dials/issues/140
glmn_set <- dials::parameters(penalty(range = c(-10,1), trans = log10_trans()),mixture())
## 100 levels of lambda and 11 levels of alpha (0, .1, ..., 1); use 100 levels rather than 200 to reduce RAM use
glmn_grid <- grid_regular(glmn_set, levels = c(100,11))
ctrl <- control_grid(save_pred = TRUE, verbose = TRUE)

glmn_tune <-tune::tune_grid(glmn_wfl,
            resamples = all_rs,
            grid = glmn_grid,
            metrics = metric_set(mae), ## tune on MAE
            control = ctrl)
##select the best tuned lambda and alpha
best_glmn <- select_best(glmn_tune, metric = "mae")
## use the best tuned parameters to fit the test data
glmn_wfl_final <- 
  glmn_wfl %>%
  finalize_workflow(best_glmn) %>%
  parsnip::fit(data = data_list[["ols_data_train"]])
### predictions on the validation set (used later for random-forest stacking)
pred_glmn <- predict(glmn_wfl_final, new_data =data_list[["ols_data_valid"]])
### predictions on the held-out test set
pred_test <- predict(glmn_wfl_final, new_data =data_list[["ols_test_set"]])
##compute the correlation between predicted results and the test data 
performance_glmn <- cor(pred_glmn$.pred,data_list[["ols_data_valid"]][[resp_var]])
performance_test <- cor(pred_test$.pred,data_list[["ols_test_set"]][[resp_var]])
best_glmn <- best_glmn %>% mutate(performance=performance_glmn,test_performance = performance_test)
return(list("param_tune"=best_glmn, 
            "predicted"=tibble(SUBJECTKEY=data_list[["data_valid_id"]],
                               "pred"=pred_glmn$.pred),
            "predicted_test"=tibble(SUBJECTKEY=data_list[["test_set_id"]], 
                                    "pred"=pred_test$.pred),
            "obs_train"=length(data_list[["data_train_id"]]),
            "obs_valid"=length(data_list[["data_valid_id"]])))
}
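For reference, the regular grid above crosses 100 levels of lambda with 11 levels of alpha, i.e., 1,100 candidate models per resample. A minimal check outside the function (glmn_set_check is a hypothetical name mirroring the call inside enet_tuning_site()):

# glmn_set_check <- dials::parameters(penalty(range = c(-10,1), trans = log10_trans()), mixture())
# nrow(grid_regular(glmn_set_check, levels = c(100,11)))  # 1100 candidate models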

4.2.3 Combine the predicted values with Random Forest Opportunistic Stacking for each cognitive task across sites

  rnd_for_data_site = function(pred_select = "predicted",
                               data_list,site_idx,
                               resp_chose,
                               resp_data){
enet_pred <- 
  plyr::join_all(list(data_list[["MID_site"]][[resp_chose]][[site_idx]][[pred_select]]%>%
                        dplyr::rename("MID_pred"="pred"),
                      data_list[["smri_site"]][[resp_chose]][[site_idx]][[pred_select]]%>% 
                        dplyr::rename("smri_pred"="pred"),
                      data_list[["SST_site"]][[resp_chose]][[site_idx]][[pred_select]]%>% 
                        dplyr::rename("SST_pred"="pred"),
                      data_list[["rsmri_site"]][[resp_chose]][[site_idx]][[pred_select]]%>% 
                        dplyr::rename("rsmri_pred"="pred"),
                      data_list[["Nback_site"]][[resp_chose]][[site_idx]][[pred_select]]%>% 
                        dplyr::rename("Nback_pred"="pred"),
                      data_list[["DTI_site"]][[resp_chose]][[site_idx]][[pred_select]]%>% 
                        dplyr::rename("DTI_pred"="pred")),
                 by='SUBJECTKEY', type='full') %>% 
  distinct(SUBJECTKEY, .keep_all = TRUE)   

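## opportunistic stacking: duplicate the predictions and code each missing value
## once as +1000 and once as -1000, so the random forest can treat missingness
## itself as a splittable signal instead of dropping incomplete observations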
enet_pred_p <- enet_pred%>% mutate_all(~replace(., is.na(.), 1000))
enet_pred_n <- enet_pred%>% mutate_all(~replace(., is.na(.), -1000))

enet_pred_noNA <- left_join(enet_pred_p,
                            enet_pred_n,
                            by="SUBJECTKEY")
resp_data <- resp_data %>% 
  select("SUBJECTKEY",resp_chose)%>% 
  distinct(SUBJECTKEY, .keep_all = TRUE) %>%    
  drop_na()%>% 
  mutate_if(is.double, ~ (.x - mean(.x)) / sd(.x))

enet_pred_noNA <- left_join(enet_pred_noNA,resp_data,by="SUBJECTKEY")
enet_pred_noNA_noID <- select(enet_pred_noNA,-SUBJECTKEY)

return(list("ID"=enet_pred_noNA,"NoID"=enet_pred_noNA_noID))
}
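A toy illustration of the dual coding above (toy, toy_p and toy_n are hypothetical objects, not part of the pipeline): a missing prediction is coded +1000 in one copy of the column and -1000 in the other, while observed values survive unchanged in both.

# toy   <- tibble(SUBJECTKEY = c("A", "B"), NBack_pred = c(0.5, NA))
# toy_p <- toy %>% mutate_all(~replace(., is.na(.), 1000))
# toy_n <- toy %>% mutate_all(~replace(., is.na(.), -1000))
# left_join(toy_p, toy_n, by = "SUBJECTKEY")
# ## subject B ends up with NBack_pred.x = 1000 and NBack_pred.y = -1000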

4.2.4 random forest model fit for each cognitive task across sites

random_forest_site <- function(resp,valid_list,test_list,site_chose){
  ### to avoid repetitive code, a list is created with one data frame with IDs and another without
rf_train_data <- valid_list[[resp]][[site_chose]]
rf_test_data <- test_list[[resp]][[site_chose]]

resp_var <- resp
##data recipe; built on the NoID data frame, which already excludes SUBJECTKEY
## do not scale the input data as 1000 and -1000 are put there on purpose
forest_rec <- recipe(as.formula(paste0(resp_var, "~ .")), data = rf_train_data[["NoID"]]) %>%
  step_dummy(all_nominal())   %>% 
    prep(training = rf_train_data[["NoID"]], retain = TRUE)
## mtry is the number of predictors to sample at each split
## min_n (the number of observations needed to keep splitting nodes)
tune_spec <- rand_forest(
  mtry = tune(),
  trees = 1000,
  min_n = tune()
) %>%
  set_mode("regression") %>%
  set_engine("ranger")
tune_wf <- workflow() %>%
  add_recipe(forest_rec) %>%
  add_model(tune_spec)
set.seed(123456)
## nested data validation 
all_rs <- rsample::vfold_cv(data=rf_train_data[["ID"]],v = 10,repeats = 1)
## automatically generate the hyperparameter grid


rf_grid <- grid_regular(
  mtry(range = c(1, 12)), #12 is the highest number of predictors 
  min_n(range = c(1, 500)), #500 is enough based on earlier analyses
  levels = c(mtry = 12, min_n = 51) #mtry = 12 so that it increases one step at a time 
)
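## the grid therefore evaluates 12 x 51 = 612 hyperparameter combinations per resample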


## the future package is loaded by furrr
doParallel::registerDoParallel()
future::plan(multisession,workers= all_cores)
tune_res <- tune_grid(
  tune_wf,
  resamples = all_rs,
  metrics = metric_set(rmse),
  grid = rf_grid
)
best_tune <- select_best(tune_res, metric = "rmse")

final_rf <- finalize_model(
  tune_spec,
  best_tune
)

forest_prep <- prep(forest_rec)
## fit the random forest model with the best tuned parameters
rf_fit <- final_rf %>%
  set_engine("ranger") %>%
 parsnip::fit(as.formula(paste0(resp_var, "~ .")),
    data = juice(forest_prep) 
  )
pred_rf <- predict(rf_fit,new_data = rf_test_data[["NoID"]])
performance_rf <- cor(pred_rf$.pred,rf_test_data[["ID"]][[resp]])
best_tune <- best_tune %>% mutate(performance=performance_rf)
result_list <- list("param_tune"=best_tune, "predicted"=tibble(SUBJECTKEY=rf_test_data[["ID"]]$SUBJECTKEY, "pred"=pred_rf$.pred),"model_fit"=rf_fit)
return(result_list)
}

4.2.5 extracting cross site performance for each cognitive task

change_modality_names <- function(performance_frame){
  performance_frame$modality[which(performance_frame$modality=="rsmri")] <- "rs_fMRI"
  performance_frame$modality[which(performance_frame$modality=="smri")] <- "sMRI"
  performance_frame$modality[which(performance_frame$modality=="random_forest")] <- "Stacked"
  performance_frame$modality[which(performance_frame$modality=="Nback")] <- "NBack"
  performance_frame %>%
    mutate(modality = as.factor(modality)) %>%
    mutate(modality = factor(modality, levels = c("Stacked","NBack","MID","SST","rs_fMRI","sMRI","DTI")))
}

4.3 Leave one site out cross validation for the G factor

CFA_lav_response_site <- function(data_input = baseline_two_fold$splits[[1]], test_baseline =baseline_test){
  ## training set: used to fit the CFA and for enet parameter tuning
data_train <- analysis(data_input)%>%
              select(all_of(subj_info),all_of(TaskDVs1Batch))%>% drop_na()%>% IQR_remove()
## random forest tuning
data_valid <- assessment(data_input)%>%
              select(all_of(subj_info),all_of(TaskDVs1Batch))%>% drop_na()%>% IQR_remove()
##  test set for baseline observations
data_baseline <- test_baseline%>%
              select(all_of(subj_info),all_of(TaskDVs1Batch))%>% drop_na()%>% IQR_remove()
### first order CFA model

NeuroCog1stOrder <-'
Language =~ NIHTBX_PICVOCAB_UNCORRECTED + NIHTBX_READING_UNCORRECTED 
CognitiveFlexibity =~ NIHTBX_FLANKER_UNCORRECTED + NIHTBX_PATTERN_UNCORRECTED 
MemoryRecall =~ NIHTBX_PICTURE_UNCORRECTED + PEA_RAVLT_LD_TRIAL_VII_TC'

NeuroCog1stOrder.Fit <- lavaan::cfa(model = NeuroCog1stOrder, data = data_train,estimator="MLR")

### second order CFA model

NeuroCog2ndOrder <-'
Language =~ NIHTBX_PICVOCAB_UNCORRECTED + NIHTBX_READING_UNCORRECTED 
CognitiveFlexibity =~ NIHTBX_FLANKER_UNCORRECTED + NIHTBX_PATTERN_UNCORRECTED 
MemoryRecall =~ NIHTBX_PICTURE_UNCORRECTED + PEA_RAVLT_LD_TRIAL_VII_TC
g =~ NA*Language + CognitiveFlexibity  + MemoryRecall # freely estimate the loading of g, as opposed to fixing it as a marker variable
g ~~ 1*g #need to constrain variance to 1'

NeuroCog2ndOrder.Fit <- lavaan::cfa(model = NeuroCog2ndOrder, data = data_train,estimator="MLR")

NeuroCogSingleFactor <-'
g =~ NIHTBX_PICVOCAB_UNCORRECTED + NIHTBX_READING_UNCORRECTED + NIHTBX_FLANKER_UNCORRECTED + NIHTBX_PATTERN_UNCORRECTED + NIHTBX_PICTURE_UNCORRECTED + PEA_RAVLT_LD_TRIAL_VII_TC'

NeuroCogSingleFactor.Fit <- lavaan::cfa(model = NeuroCogSingleFactor, data = data_train,estimator="MLR")

train_CFA_lav <- CFA_lav_output_processing(data_input = data_train,
                                           CFA_model1 = NeuroCog1stOrder.Fit,
                                           CFA_model2 = NeuroCog2ndOrder.Fit, 
                                           CFA_model3 = NeuroCogSingleFactor.Fit)
valid_CFA_lav <- CFA_lav_output_processing(data_input = data_valid,
                                           CFA_model1 = NeuroCog1stOrder.Fit,
                                           CFA_model2 = NeuroCog2ndOrder.Fit, 
                                           CFA_model3 = NeuroCogSingleFactor.Fit)
baseline_CFA_lav <- CFA_lav_output_processing(data_input = data_baseline,
                                              CFA_model1 = NeuroCog1stOrder.Fit,
                                              CFA_model2 = NeuroCog2ndOrder.Fit, 
                                              CFA_model3 = NeuroCogSingleFactor.Fit)

output_list <- list(enet_tuning = train_CFA_lav, RanFor_tuning = valid_CFA_lav, testing_baseline = baseline_CFA_lav)

return(list(output_list, NeuroCog1stOrder.Fit, NeuroCog2ndOrder.Fit,NeuroCogSingleFactor.Fit))
}
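CFA_lav_output_processing() (defined earlier in this document) extracts factor scores from these fitted models for each data split. As a minimal sketch of the underlying lavaan call, assuming the fit objects above, second-order g scores for new data could be obtained with lavPredict():

# g_scores <- lavaan::lavPredict(NeuroCog2ndOrder.Fit, newdata = data_baseline)[, "g"]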

CFA_list_site <- all_site %>% map(.,~CFA_lav_response_site(data_input = train_valid_split[[.]][["splits"]][[1]], 
                                                           test_baseline =test_site_split[[.]])) 

names(CFA_list_site) <- all_site

CFA_resp_site <- c("g_second_order" )

4.3.1 functions for running enet across sites

Nback_cross_site_CFA <- function(resp_chosen){
 # doParallel::registerDoParallel()
### map over all sites instead of one single response variable
  
  Nback_site_data <- all_site %>% 
    map(.,~enet_site_nocom(resp_var = resp_chosen,
                           split_train = left_join(analysis(train_valid_split[[.]]$splits[[1]]),
                                                   CFA_list_site[[.]][[1]][["enet_tuning"]],
                                                   by="SUBJECTKEY")%>% 
                             distinct(SUBJECTKEY, .keep_all = TRUE)  ,
                           split_valid=left_join(assessment(train_valid_split[[.]]$splits[[1]]),
                                                 CFA_list_site[[.]][[1]][["RanFor_tuning"]],
                                                 by="SUBJECTKEY")%>% 
                             distinct(SUBJECTKEY, .keep_all = TRUE)   ,
                           split_test = left_join(test_site_split[[.]],
                                                  CFA_list_site[[.]][[1]][["testing_baseline"]],
                                                  by="SUBJECTKEY")%>% 
                             distinct(SUBJECTKEY, .keep_all = TRUE) , 
                           contrast_name = "X2backvs0back_ROI_"))

  names(Nback_site_data) <- all_sample %>% 
    future_map (.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L),
              .options = furrr_options(seed=TRUE),
              .progress = TRUE) 

 Nback_site <- all_site %>%
   map(.,~enet_tuning_site(resp_var =resp_chosen,
                           data_list = Nback_site_data[[.]]))

names(Nback_site)<- all_sample %>% 
  map (.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L)) 

return(Nback_site)
}

SST_cross_site_CFA <- function(resp_chosen){
#  doParallel::registerDoParallel()
### map over all sites instead of one single response variable

SST_site_data <- all_site %>% 
  map(.,~enet_site_nocom(resp_var = resp_chosen,
                         split_train = left_join(analysis(train_valid_split[[.]]$splits[[1]]),
                                                 CFA_list_site[[.]][[1]][["enet_tuning"]],
                                                 by="SUBJECTKEY")%>% 
                           distinct(SUBJECTKEY, .keep_all = TRUE)  ,
                           split_valid=left_join(assessment(train_valid_split[[.]]$splits[[1]]),
                                                 CFA_list_site[[.]][[1]][["RanFor_tuning"]],
                                                 by="SUBJECTKEY")%>% 
                           distinct(SUBJECTKEY, .keep_all = TRUE)   ,
                           split_test = left_join(test_site_split[[.]],
                                                  CFA_list_site[[.]][[1]][["testing_baseline"]],
                                                  by="SUBJECTKEY")%>% 
                           distinct(SUBJECTKEY, .keep_all = TRUE) ,
                         contrast_name = "anystopvscorrectgo_ROI_"))

 names(SST_site_data) <- all_sample %>% 
   map(.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L)) 

SST_site <- all_site %>% 
  map(.,~enet_tuning_site(resp_var =resp_chosen,
                          data_list = SST_site_data[[.]]))

 names(SST_site)<- all_sample %>% 
   map(.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L)) 

return(SST_site)
}

MID_cross_site_CFA <- function(resp_chosen){
 # doParallel::registerDoParallel()
### map over all sites instead of one single response variable
MID_site_data <- all_site %>% 
  map(.,~enet_site_nocom(resp_var = resp_chosen,
                         split_train =left_join(analysis(train_valid_split[[.]]$splits[[1]]),
                                                CFA_list_site[[.]][[1]][["enet_tuning"]],
                                                by="SUBJECTKEY")%>% 
                           distinct(SUBJECTKEY, .keep_all = TRUE)  ,
                           split_valid=left_join(assessment(train_valid_split[[.]]$splits[[1]]),
                                                 CFA_list_site[[.]][[1]][["RanFor_tuning"]],
                                                 by="SUBJECTKEY")%>% 
                           distinct(SUBJECTKEY, .keep_all = TRUE)   ,
                           split_test = left_join(test_site_split[[.]],
                                                  CFA_list_site[[.]][[1]][["testing_baseline"]],
                                                  by="SUBJECTKEY")%>% 
                           distinct(SUBJECTKEY, .keep_all = TRUE) ,
                         contrast_name = "antiRewVsNeu_ROI_")) 

names(MID_site_data)<- all_sample %>% 
  map(.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L)) 

MID_site <- all_site %>% 
  map(.,~enet_tuning_site(resp_var =resp_chosen,
                          data_list = MID_site_data[[.]]))

names(MID_site) <- all_sample %>% 
  map(.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L)) 

return(MID_site)
}


smri_cross_site_CFA <- function(resp_chosen){
 # doParallel::registerDoParallel()
### map over all sites instead of one single response variable
smri_site_data <- all_site %>% 
  map(.,~enet_site_com(resp_var = resp_chosen,
                       split_train = left_join(analysis(train_valid_split[[.]]$splits[[1]]),
                                               CFA_list_site[[.]][[1]][["enet_tuning"]],
                                               by="SUBJECTKEY")%>% 
                         distinct(SUBJECTKEY, .keep_all = TRUE)  ,
                           split_valid=left_join(assessment(train_valid_split[[.]]$splits[[1]]),
                                                 CFA_list_site[[.]][[1]][["RanFor_tuning"]],
                                                 by="SUBJECTKEY")%>% 
                         distinct(SUBJECTKEY, .keep_all = TRUE)   ,
                           split_test = left_join(test_site_split[[.]],
                                                  CFA_list_site[[.]][[1]][["testing_baseline"]],
                                                  by="SUBJECTKEY")%>% 
                         distinct(SUBJECTKEY, .keep_all = TRUE) ,
                       measure_one = "ASEG_",
                       measure_two = "Dest_Thick_"))

names(smri_site_data)<- all_sample %>% 
  map(.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L)) 
smri_site <- all_site %>% 
  map(.,~enet_tuning_site(resp_var =resp_chosen,
                          data_list = smri_site_data[[.]]))
names(smri_site)<- all_sample %>% 
  map(.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L)) 
return(smri_site)
}


rsmri_cross_site_CFA <- function(resp_chosen){
#  doParallel::registerDoParallel()
### map over all sites instead of one single response variable
rsmri_site_data <- all_site %>% 
  map(.,~enet_site_com(resp_var = resp_chosen,
                       split_train = left_join(analysis(train_valid_split[[.]]$splits[[1]]),
                                               CFA_list_site[[.]][[1]][["enet_tuning"]],
                                               by="SUBJECTKEY")%>% 
                         distinct(SUBJECTKEY, .keep_all = TRUE)  ,
                           split_valid=left_join(assessment(train_valid_split[[.]]$splits[[1]]),
                                                 CFA_list_site[[.]][[1]][["RanFor_tuning"]],
                                                 by="SUBJECTKEY")%>%
                         distinct(SUBJECTKEY, .keep_all = TRUE)   ,
                           split_test = left_join(test_site_split[[.]],
                                                  CFA_list_site[[.]][[1]][["testing_baseline"]],
                                                  by="SUBJECTKEY")%>% 
                         distinct(SUBJECTKEY, .keep_all = TRUE) ,
                       measure_one = "par",
                       measure_two = "Within"))

 names(rsmri_site_data)  <- all_sample %>% 
   map (.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L)) 
 
 rsmri_site <- all_site %>% 
   map(.,~enet_tuning_site(resp_var =resp_chosen,
                           data_list = rsmri_site_data[[.]]))

names(rsmri_site) <- all_sample %>% 
  map(.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L)) 

return(rsmri_site)
}


DTI_cross_site_CFA <- function(resp_chosen){
#  doParallel::registerDoParallel()
### map over all sites instead of one single response variable
DTI_site_data <-  all_site %>% 
  map(.,~enet_site_comOne(resp_var = resp_chosen,
                          split_train =
                            left_join(analysis(train_valid_split[[.]]$splits[[1]]),
                                      CFA_list_site[[.]][[1]][["enet_tuning"]],
                                      by="SUBJECTKEY")%>% 
                            distinct(SUBJECTKEY, .keep_all = TRUE)  ,
                           split_valid=left_join(assessment(train_valid_split[[.]]$splits[[1]]),
                                                 CFA_list_site[[.]][[1]][["RanFor_tuning"]],
                                                 by="SUBJECTKEY")%>% 
                            distinct(SUBJECTKEY, .keep_all = TRUE)   ,
                           split_test = left_join(test_site_split[[.]],
                                                  CFA_list_site[[.]][[1]][["testing_baseline"]],
                                                  by="SUBJECTKEY")%>% 
                            distinct(SUBJECTKEY, .keep_all = TRUE) ,
                          measure_one = "FA_")) 
names(DTI_site_data)  <- all_sample %>%
  map(.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L)) 

DTI_site <-  all_site %>% 
  map(.,~enet_tuning_site(resp_var =resp_chosen,
                          data_list = DTI_site_data[[.]]))
  
 names(DTI_site)<- all_sample %>% map(.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L)) 

return(DTI_site)
}

4.3.2 Actual enet parameter tuning for G-Factor across sites

Nback_enet_cross_site_CFA <- CFA_resp_site %>% map(.,~Nback_cross_site_CFA(resp_chosen =.))
SST_enet_cross_site_CFA <- CFA_resp_site %>% map(.,~SST_cross_site_CFA(resp_chosen =.))
MID_enet_cross_site_CFA <- CFA_resp_site %>% map(.,~MID_cross_site_CFA(resp_chosen =.))
smri_enet_cross_site_CFA <- CFA_resp_site %>% map(.,~smri_cross_site_CFA(resp_chosen =.))
rsmri_enet_cross_site_CFA <- CFA_resp_site%>% map(.,~rsmri_cross_site_CFA(resp_chosen =.))
DTI_enet_cross_site_CFA <- CFA_resp_site %>% map(.,~DTI_cross_site_CFA(resp_chosen =.))


rf_enet_cross_site_CFA <- list("Nback_site"=Nback_enet_cross_site_CFA,
                           "SST_site"=SST_enet_cross_site_CFA,
                           "MID_site"=MID_enet_cross_site_CFA,
                           "smri_site"=smri_enet_cross_site_CFA,
                           "rsmri_site"=rsmri_enet_cross_site_CFA,
                           "DTI_site"=DTI_enet_cross_site_CFA)

4.3.3 Actual random forest model fit for G-Factor across sites

all_cores <- parallel::detectCores(logical = FALSE) - 2


mri_site <- paste0(mri_vec,"_site")
for(i in 1:length(mri_site)){
  names(rf_enet_cross_site_CFA[[mri_site[i]]])<- CFA_resp_site
}

random_forest_validsite_CFA <- CFA_resp_site %>% map(.,function(resp_idx=.){
random_forest_onesite <-   all_site %>%map(.,
                                           ~rnd_for_data_site(pred_select = "predicted",
                                                              data_list=rf_enet_cross_site_CFA,site_idx = .,
                                                              resp_chose=resp_idx,
                                                              resp_data=CFA_list_site[[.]][[1]][["RanFor_tuning"]]) ) 
names(random_forest_onesite) <- all_site
return(random_forest_onesite)
})
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(resp_chose)` instead of `resp_chose` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
names(random_forest_validsite_CFA) <- CFA_resp_site

random_forest_testsite_CFA <- CFA_resp_site %>% map(.,function(resp_idx=.){
random_forest_onesite <-   all_site %>%
  map(.,~rnd_for_data_site(pred_select = "predicted_test",
                           data_list=rf_enet_cross_site_CFA,
                           site_idx = .,
                           resp_chose=resp_idx,
                           resp_data=CFA_list_site[[.]][[1]][["testing_baseline"]])) 
names(random_forest_onesite) <- all_site
return(random_forest_onesite)
})
names(random_forest_testsite_CFA) <- CFA_resp_site
##keep from using all the cores
##keep your computer safe
no_cores <- availableCores() - 4
plan(multisession,workers= no_cores)
### this computation uses more than 10 cores and around 50 GB of RAM

random_forest_crosssite_CFA <- CFA_resp_site %>% future_map(.,function(resp_idx=.){
random_forest_oneresp  <-all_site %>% future_map(.,~random_forest_site(resp = resp_idx,
                                                                valid_list = random_forest_validsite_CFA,
                                                                test_list = random_forest_testsite_CFA,site_chose = .),
             .options = furrr_options(seed=TRUE))
names(random_forest_oneresp) <- all_site

return(random_forest_oneresp)
  },.options = furrr_options(seed=TRUE))

4.3.4 extracting cross site performance for G-Factor

### enet
modality_site_CFA <-names(rf_enet_cross_site_CFA) %>% str_remove_all("_site")
names(rf_enet_cross_site_CFA)<- modality_site_CFA

cross_site_enet_performance_CFA = function(resp_vec,data_input,modality_input,target_data){
  data_split <- data_input[[modality_input]]
  one_modality_extract <- resp_vec %>% 
    
    map(.,function(resp_input=.){
      performance_extract <- all_site %>%
        
        map(.,function(site_idx){
          performance_site <- 
            data_split[[resp_input]][[site_idx]][["param_tune"]][["test_performance"]]
          mae_frame <- left_join(data_split[[resp_input]][[site_idx]][["predicted_test"]],
                                 target_data[[site_idx]][[1]][["testing_baseline"]], 
                                 by="SUBJECTKEY")%>% 
            select("pred",all_of(resp_input) ,"SUBJECTKEY") %>%
            distinct(SUBJECTKEY, .keep_all = TRUE)%>% drop_na()
           mae_frame  <- mae_frame %>% mutate(scaled_target = as.vector(scale(mae_frame[[resp_input]])) )
          ## the data has to be referenced like this to be put into this function
          mae_perform <- yardstick::mae(data = mae_frame,
                                        truth=scaled_target,
                                        estimate=pred)
          rmse_perform <- yardstick::rmse(data = mae_frame,
                                          truth=scaled_target,
                                          estimate=pred)
          tradRsq_perform <- yardstick::rsq_trad(data = mae_frame,
                                                 truth=scaled_target,
                                                 estimate=pred)  
          
          combined_results <- tibble(performance = as.numeric(performance_site),
                                     site=site_idx,
                                     modality=modality_input,
                                     response=resp_input,
                                     mae_performance=mae_perform$.estimate,
                                     rmse_performance=rmse_perform$.estimate,
                                     tradRsq_performance=tradRsq_perform$.estimate
                                     ) 
          return(combined_results)
        })%>% 
        bind_rows()
    })
}

performance_enet_cross_site_CFA <- modality_vec %>% 
  map(.,~cross_site_enet_performance_CFA(resp_vec = CFA_resp_site,
                                     data_input = rf_enet_cross_site_CFA,
                                     modality_input = .,target_data =CFA_list_site))
names(performance_enet_cross_site_CFA) <- modality_vec

for(i in 1: length(modality_vec)){
  names(performance_enet_cross_site_CFA[[modality_vec[i]]])<- CFA_resp_site
}

response_enet_cross_site_CFA <- CFA_resp_site %>% 
  map(.,function(resp_input=.,
                 data_input=performance_enet_cross_site_CFA){
  one_resp_performance <-  modality_site_CFA %>% 
    map(.,function(modality_input=.){
    one_resp_modality <- data_input[[modality_input]][[resp_input]]
    return(one_resp_modality)
  })%>% bind_rows()
  
 one_resp_performance_test<- one_resp_performance 
 
return(one_resp_performance_test)
})
names(response_enet_cross_site_CFA) <- CFA_resp_site

### random forest

names(random_forest_crosssite_CFA)<- CFA_resp_site

cross_site_rf_performance_CFA = function(resp_idx, data_input = random_forest_crosssite,
                                         plotting_frame = resp_var_plotting, target_data = test_site_split){
  site_performance <- all_site %>% map(.,function(site_idx){
    one_site <- data_input[[resp_idx]][[site_idx]][["param_tune"]][["performance"]]
    
    mae_frame <- left_join(data_input[[resp_idx]][[site_idx]][["predicted"]],
                           target_data[[site_idx]][[1]][["testing_baseline"]], 
                           by = "SUBJECTKEY") %>% 
      select("pred", all_of(resp_idx), "SUBJECTKEY") %>%
      distinct(SUBJECTKEY, .keep_all = TRUE) %>% drop_na()
    ## standardize the observed target so MAE and RMSE are in SD units, matching the elastic-net pipeline
    mae_frame <- mae_frame %>% mutate(scaled_target = as.vector(scale(mae_frame[[resp_idx]])))
    mae_perform <- yardstick::mae(data = mae_frame, truth = scaled_target, estimate = pred)  
    rmse_perform <- yardstick::rmse(data = mae_frame, truth = scaled_target, estimate = pred)  
    tradRsq_perform <- yardstick::rsq_trad(data = mae_frame, truth = scaled_target, estimate = pred)  

    combined_results <- tibble(performance = as.numeric(one_site),
                               site = site_idx,
                               modality = "random_forest",
                               response = resp_idx,
                               longer_name = plotting_frame$longer_name[which(plotting_frame$response == resp_idx)],
                               shorter_name = plotting_frame$short_name[which(plotting_frame$response == resp_idx)],
                               mae_performance = mae_perform$.estimate,
                               rmse_performance = rmse_perform$.estimate,
                               tradRsq_performance = tradRsq_perform$.estimate)
    return(combined_results)
  }) %>% bind_rows() 
  return(site_performance)
}
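Note that the metric block here mirrors the one in the elastic-net function above. If this were refactored, the shared computation could live in one helper; a minimal sketch (the helper name compute_site_metrics is ours, not part of the pipeline), assuming a frame that already holds pred and scaled_target columns:

## hypothetical helper: the three error metrics on a prediction frame
compute_site_metrics <- function(mae_frame) {
  tibble(mae_performance     = yardstick::mae(mae_frame,      truth = scaled_target, estimate = pred)$.estimate,
         rmse_performance    = yardstick::rmse(mae_frame,     truth = scaled_target, estimate = pred)$.estimate,
         tradRsq_performance = yardstick::rsq_trad(mae_frame, truth = scaled_target, estimate = pred)$.estimate)
}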


performance_rf_cross_site_CFA <- CFA_resp_site %>% 
  map(.,~cross_site_rf_performance_CFA(resp_idx = .,
                                       data_input = random_forest_crosssite_CFA,
                                       plotting_frame = CFA_var_plotting,
                                       target_data = CFA_list_site))
names(performance_rf_cross_site_CFA) <- CFA_resp_site

performance_all_modalities_CFA <- CFA_resp_site %>% 
  map(.,~bind_rows(performance_rf_cross_site_CFA[[.]],
                   response_enet_cross_site_CFA[[.]]) %>%
        mutate(site_num = str_remove(site, "site"))) 
names(performance_all_modalities_CFA) <- CFA_resp_site
performance_all_modalities_CFA <- performance_all_modalities_CFA %>% 
  map(.,~change_modality_names(performance_frame = .))
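One caveat for the plots below: site_num is a character column, so ggplot2 orders the x-axis lexically (site "10" sorts before "2"). If numeric ordering is wanted, one option is to re-level it as a factor (a sketch, not part of the original pipeline):

## order the character site labels by their numeric value so the x-axis runs 1, 2, ..., 10
performance_all_modalities_CFA <- performance_all_modalities_CFA %>% 
  map(.,~mutate(., site_num = factor(site_num,
                                     levels = unique(site_num[order(as.numeric(site_num))]))))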
  
   
CFA_resp_site %>% map(.,~ggplot(performance_all_modalities_CFA[[.]], 
                                aes(site_num, performance, 
                                    col = modality)) +
    geom_point(key_glyph = "point", size = 3) +
    labs(x = 'Site', 
         y = NULL, 
         col = 'Model',
         title = paste0("Out-of-site Predictive Ability (Pearson's r)\n",
                        CFA_var_plotting$short_name[[which(CFA_var_plotting$response == .)]])) +
    theme(axis.text.x = element_text(size = 16, angle = 45),
          axis.text.y = element_text(size = 16),
          axis.title.y = element_text(size = 18),
          axis.title.x = element_text(size = 18),
          plot.title = element_text(size = 20),
          legend.position = "right",
          legend.title = element_blank(),
          legend.text = element_text(size = 18)))

## [[1]]
## Warning: Removed 3 rows containing missing values.

(Figure: per-site out-of-site Pearson's r, coloured by model.)

CFA_resp_site %>% map(.,~ggplot(performance_all_modalities_CFA[[.]], 
                                aes(site_num, mae_performance, 
                                    col = modality)) +
    geom_point(key_glyph = "point", size = 3) +
    labs(x = 'Site', 
         y = NULL, 
         col = 'Model',
         title = paste0("Out-of-site Predictive Ability (MAE)\n",
                        CFA_var_plotting$short_name[[which(CFA_var_plotting$response == .)]])) +
    theme(axis.text.x = element_text(size = 16, angle = 45),
          axis.text.y = element_text(size = 16),
          axis.title.y = element_text(size = 18),
          axis.title.x = element_text(size = 18),
          plot.title = element_text(size = 20),
          legend.position = "right",
          legend.title = element_blank(),
          legend.text = element_text(size = 18)))

## [[1]]
## Warning: Removed 3 rows containing missing values.

(Figure: per-site out-of-site MAE, coloured by model.)

CFA_resp_site %>% map(.,~ggplot(performance_all_modalities_CFA[[.]], 
                                aes(site_num, rmse_performance, 
                                    col = modality)) +
    geom_point(key_glyph = "point", size = 3) +
    labs(x = 'Site', 
         y = NULL, 
         col = 'Model',
         title = paste0("Out-of-site Predictive Ability (RMSE)\n",
                        CFA_var_plotting$short_name[[which(CFA_var_plotting$response == .)]])) +
    theme(axis.text.x = element_text(size = 16, angle = 45),
          axis.text.y = element_text(size = 16),
          axis.title.y = element_text(size = 18),
          axis.title.x = element_text(size = 18),
          plot.title = element_text(size = 20),
          legend.position = "right",
          legend.title = element_blank(),
          legend.text = element_text(size = 18)))

## [[1]]
## Warning: Removed 3 rows containing missing values.

(Figure: per-site out-of-site RMSE, coloured by model.)

CFA_resp_site %>% map(.,~ggplot(performance_all_modalities_CFA[[.]], 
                                aes(site_num, tradRsq_performance, 
                                    col = modality)) +
    geom_point(key_glyph = "point", size = 3) +
    labs(x = 'Site', 
         y = NULL, 
         col = 'Model',
         title = paste0("Out-of-site Predictive Ability (R squared)\n",
                        CFA_var_plotting$short_name[[which(CFA_var_plotting$response == .)]])) +
    theme(axis.text.x = element_text(size = 16, angle = 45),
          axis.text.y = element_text(size = 16),
          axis.title.y = element_text(size = 18),
          axis.title.x = element_text(size = 18),
          plot.title = element_text(size = 20),
          legend.position = "right",
          legend.title = element_blank(),
          legend.text = element_text(size = 18)))

## [[1]]
## Warning: Removed 3 rows containing missing values.

(Figure: per-site out-of-site R squared, coloured by model.)

### getting summary statistics 
performance_all_modalities_CFA <- performance_all_modalities_CFA %>% 
  map(.,~mutate(.,
                Pearson_r = performance,
                R_squared = tradRsq_performance,
                MAE = mae_performance,
                RMSE = rmse_performance))



performance_all_modalities_CFA %>% 
  imap(.,~group_by(.x, modality) %>%
        summarise_at(c("Pearson_r", "R_squared", "MAE", "RMSE"),
                     mean, na.rm = TRUE) %>%
        mutate_at(vars(c("Pearson_r", "R_squared", "MAE", "RMSE")), round, digits = 3) %>% 
        kableExtra::kbl(caption = paste0("Mean of all modalities across sites\n",
                                         CFA_var_plotting$longer_name[which(CFA_var_plotting$response == .y)])) %>%
        kableExtra::kable_classic(full_width = F, html_font = "Cambria")
)
$g_second_order

Mean of all modalities across sites

modality   Pearson_r  R_squared    MAE   RMSE
Stacked        0.461      0.211  0.699  0.887
NBack          0.410      0.168  0.718  0.910
MID            0.223      0.049  0.774  0.973
SST            0.136      0.017  0.784  0.989
rs_fMRI        0.261      0.068  0.763  0.964
sMRI           0.246      0.062  0.763  0.967
DTI            0.220      0.047  0.767  0.975
performance_all_modalities_CFA %>% 
  imap(.,~group_by(.x, modality) %>%
        summarise_at(c("Pearson_r", "R_squared", "MAE", "RMSE"),
                     sd, na.rm = TRUE) %>%
        mutate_at(vars(c("Pearson_r", "R_squared", "MAE", "RMSE")), round, digits = 3) %>% 
        kableExtra::kbl(caption = paste0("Standard deviation of all modalities across sites\n",
                                         CFA_var_plotting$longer_name[which(CFA_var_plotting$response == .y)])) %>%
        kableExtra::kable_classic(full_width = F, html_font = "Cambria")
)
$g_second_order

Standard deviation of all modalities across sites

modality   Pearson_r  R_squared    MAE   RMSE
Stacked        0.055      0.050  0.023  0.028
NBack          0.063      0.051  0.027  0.029
MID            0.093      0.044  0.019  0.023
SST            0.066      0.020  0.012  0.010
rs_fMRI        0.058      0.029  0.015  0.015
sMRI           0.088      0.043  0.022  0.023
DTI            0.071      0.035  0.017  0.018
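An aside on style: the summarise_at()/mutate_at() verbs used above are superseded in current dplyr; the mean table can be reproduced with across() (a sketch in the same spirit, not the original code):

metric_cols <- c("Pearson_r", "R_squared", "MAE", "RMSE")

## mean of each metric per modality, rounded to three decimals
performance_all_modalities_CFA %>% 
  map(.,~group_by(., modality) %>%
        summarise(across(all_of(metric_cols), ~round(mean(.x, na.rm = TRUE), 3))))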

4.3.5 Plotting cross-site performance for the G-Factor within a single plot

corr_plot <- CFA_resp_site %>% map(.,~ggplot(performance_all_modalities_CFA[[.]], 
                                             aes(site_num, performance, 
                                                 col = modality)) +
    geom_point(key_glyph = "point", size = 3) +
    labs(x = NULL, 
         y = NULL, 
         col = 'Model',
         title = "Pearson's r") +
    theme(axis.text.x = element_text(size = 14, angle = 45),
          axis.text.y = element_text(size = 14),
          axis.title.y = element_text(size = 16),
          axis.title.x = element_text(size = 16),
          plot.title = element_text(size = 18),
          legend.position = "right",
          legend.title = element_blank(),
          legend.text = element_text(size = 18)))

mae_plot <- CFA_resp_site %>% map(.,~ggplot(performance_all_modalities_CFA[[.]], 
                                            aes(site_num, mae_performance, 
                                                col = modality)) +
    geom_point(key_glyph = "point", size = 3) +
    labs(x = NULL, 
         y = NULL, 
         col = 'Model',
         title = "MAE") +
    theme(axis.text.x = element_text(size = 14, angle = 45),
          axis.text.y = element_text(size = 14),
          axis.title.y = element_text(size = 16),
          axis.title.x = element_text(size = 16),
          plot.title = element_text(size = 18),
          legend.position = "right",
          legend.title = element_blank(),
          legend.text = element_text(size = 18)))

rmse_plot <- CFA_resp_site %>% map(.,~ggplot(performance_all_modalities_CFA[[.]], 
                                             aes(site_num, rmse_performance, 
                                                 col = modality)) +
    geom_point(key_glyph = "point", size = 3) +
    labs(x = NULL, 
         y = NULL, 
         col = 'Model',
         title = "RMSE") +
    theme(axis.text.x = element_text(size = 14, angle = 45),
          axis.text.y = element_text(size = 14),
          axis.title.y = element_text(size = 16),
          axis.title.x = element_text(size = 16),
          plot.title = element_text(size = 18),
          legend.position = "right",
          legend.title = element_blank(),
          legend.text = element_text(size = 18)))

rsq_plot <- CFA_resp_site %>% map(.,~ggplot(performance_all_modalities_CFA[[.]], 
                                            aes(site_num, tradRsq_performance, 
                                                col = modality)) +
    geom_point(key_glyph = "point", size = 3) +
    labs(x = NULL, 
         y = NULL, 
         col = 'Model',
         title = "R squared") +
    theme(axis.text.x = element_text(size = 14, angle = 45),
          axis.text.y = element_text(size = 14),
          axis.title.y = element_text(size = 16),
          axis.title.x = element_text(size = 16),
          plot.title = element_text(size = 18),
          legend.position = "right",
          legend.title = element_blank(),
          legend.text = element_text(size = 18)))

title_combined_plot <- ggdraw() + 
  draw_label(
    "Out-of-site Predictive Ability",
    fontface = 'bold',
    x = 0,
    hjust = 0,
    size=24
  ) +
  theme(
    # add margin on the left of the drawing canvas,
    # so title is aligned with left edge of first plot
    plot.margin = margin(0, 0, 0, 5)
  )
plot_combined_list <- list(corr_plot = corr_plot[[1]], 
                           rsq_plot = rsq_plot[[1]], 
                           mae_plot = mae_plot[[1]],
                           rmse_plot = rmse_plot[[1]])
combined_plot <- ggpubr::ggarrange(title_combined_plot,
                                   ggpubr::ggarrange(plotlist=plot_combined_list, 
                                                     common.legend = TRUE,
                                                     legend = "right"),
                                   nrow = 2 , heights = c(0.1, 1))
## Warning: Removed 3 rows containing missing values.
## Removed 3 rows containing missing values.
## Removed 3 rows containing missing values.
## Removed 3 rows containing missing values.
## Removed 3 rows containing missing values.
ggpubr::annotate_figure(combined_plot,bottom = ggpubr::text_grob("Site",size=15))
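For reference, the same arrangement could also be built entirely with cowplot, which is already loaded: extract the shared legend once, strip the per-panel legends, and stack the title over the grid. A hedged sketch, assuming the plot objects created above:

## pull one legend to share across panels, then drop legends from the panels
shared_legend <- cowplot::get_legend(corr_plot[[1]])
panels <- cowplot::plot_grid(corr_plot[[1]] + theme(legend.position = "none"),
                             rsq_plot[[1]]  + theme(legend.position = "none"),
                             mae_plot[[1]]  + theme(legend.position = "none"),
                             rmse_plot[[1]] + theme(legend.position = "none"),
                             nrow = 2)
cowplot::plot_grid(title_combined_plot,
                   cowplot::plot_grid(panels, shared_legend, rel_widths = c(1, 0.2)),
                   nrow = 2, rel_heights = c(0.1, 1))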

5 R session and library

pander::pander(sessionInfo())

R version 4.1.3 (2022-03-10)

Platform: x86_64-w64-mingw32/x64 (64-bit)

locale: LC_COLLATE=English_New Zealand.1252, LC_CTYPE=English_New Zealand.1252, LC_MONETARY=English_New Zealand.1252, LC_NUMERIC=C and LC_TIME=English_New Zealand.1252

attached base packages: stats4, grid, stats, graphics, grDevices, utils, datasets, methods and base

other attached packages: fastshap(v.0.0.7), ggsegJHU(v.1.0.1.001), ggsegDesterieux(v.1.0.1.002), ggsegExtra(v.1.5.33.004), ggseg3d(v.1.6.3), ggseg(v.1.6.4), vip(v.0.3.2), sva(v.3.42.0), BiocParallel(v.1.28.3), genefilter(v.1.76.0), mgcv(v.1.8-39), nlme(v.3.1-155), corpcor(v.1.6.10), permimp(v.1.0-2), party(v.1.3-10), strucchange(v.1.5-2), sandwich(v.3.0-1), zoo(v.1.8-10), modeltools(v.0.2-23), mvtnorm(v.1.1-3), furrr(v.0.3.0), future(v.1.25.0), plotly(v.4.10.0), cowplot(v.1.1.1), yardstick(v.0.0.9), workflowsets(v.0.2.1), workflows(v.0.2.6), tune(v.0.2.0), rsample(v.0.1.1), recipes(v.0.2.0), parsnip(v.0.2.1), modeldata(v.0.1.1), infer(v.1.0.0), dials(v.0.1.1), scales(v.1.2.0), tidymodels(v.0.2.0), pander(v.0.6.4), olsrr(v.0.5.3), glmnet(v.4.1-4), Matrix(v.1.4-0), eNetXplorer(v.1.1.3), broom(v.0.8.0), forcats(v.0.5.1), stringr(v.1.4.0), dplyr(v.1.0.8), purrr(v.0.3.4), readr(v.2.1.2), tidyr(v.1.2.0), tibble(v.3.1.6), ggplot2(v.3.3.6) and tidyverse(v.1.3.1)

loaded via a namespace (and not attached): rgl(v.0.108.3), Hmisc(v.4.6-0), svglite(v.2.1.0), class(v.7.3-20), foreach(v.1.5.2), crayon(v.1.5.1), MASS(v.7.3-55), backports(v.1.4.1), reprex(v.2.0.1), rlang(v.1.0.2), XVector(v.0.34.0), readxl(v.1.4.0), extrafontdb(v.1.0), nloptr(v.2.0.0), limma(v.3.50.3), naniar(v.0.6.1), extrafont(v.0.18), bit64(v.4.0.5), glue(v.1.6.2), parallel(v.4.1.3), oro.nifti(v.0.11.0), AnnotationDbi(v.1.56.2), BiocGenerics(v.0.40.0), classInt(v.0.4-3), haven(v.2.5.0), tidyselect(v.1.1.2), RRPP(v.1.2.3), XML(v.3.99-0.9), calibrate(v.1.7.7), sf(v.1.0-7), ggpubr(v.0.4.0), SuppDists(v.1.1-9.7), xtable(v.1.8-4), magrittr(v.2.0.3), evaluate(v.0.15), cli(v.3.2.0), zlibbioc(v.1.40.0), rstudioapi(v.0.13), sp(v.1.4-7), DiceDesign(v.1.9), bslib(v.0.3.1), rpart(v.4.1.16), pbmcapply(v.1.5.1), numform(v.0.7.0), xfun(v.0.30), cluster(v.2.1.2), caTools(v.1.18.2), KEGGREST(v.1.34.0), RNifti(v.1.4.0), expm(v.0.999-6), ape(v.5.6-2), listenv(v.0.8.0), reshape(v.0.8.9), Biostrings(v.2.62.0), png(v.0.1-7), ipred(v.0.9-12), withr(v.2.5.0), ggforce(v.0.3.3), neurobase(v.1.32.1), bitops(v.1.0-7), freesurfer(v.1.6.8), ranger(v.0.13.1), plyr(v.1.8.6), cellranger(v.1.1.0), hardhat(v.0.2.0), e1071(v.1.7-9), pROC(v.1.18.0), coda(v.0.19-4), pillar(v.1.7.0), RcppParallel(v.5.1.5), gplots(v.3.1.3), cachem(v.1.0.6), multcomp(v.1.4-19), fs(v.1.5.2), raster(v.3.5-15), geomorph(v.4.0.3), vctrs(v.0.4.0), pbivnorm(v.0.6.0), ellipsis(v.0.3.2), generics(v.0.1.2), nortest(v.1.0-4), lava(v.1.6.10), rgdal(v.1.5-31), tools(v.4.1.3), foreign(v.0.8-82), tweenr(v.1.0.2), munsell(v.0.5.0), emmeans(v.1.7.3), proxy(v.0.4-26), fastmap(v.1.1.0), compiler(v.4.1.3), abind(v.1.4-5), stars(v.0.5-5), DescTools(v.0.99.45), semPlot(v.1.1.5), GenomeInfoDbData(v.1.2.7), prodlim(v.2019.11.13), gridExtra(v.2.3), OpenMx(v.2.20.6), edgeR(v.3.36.0), lattice(v.0.20-45), utf8(v.1.2.2), jsonlite(v.1.8.0), arm(v.1.12-2), GGally(v.2.1.2), gld(v.2.6.4), pbapply(v.1.5-0), carData(v.3.0-5), estimability(v.1.3), lazyeval(v.0.2.2), car(v.3.0-13), R.utils(v.2.11.0), latticeExtra(v.0.6-29), goftest(v.1.2-3), checkmate(v.2.0.0), rmarkdown(v.2.14), openxlsx(v.4.2.5), webshot(v.0.5.3), Biobase(v.2.54.0), igraph(v.1.2.11), survival(v.3.2-13), numDeriv(v.2016.8-1.1), yaml(v.2.3.5), timeROC(v.0.4), systemfonts(v.1.0.4), survivalROC(v.1.0.3), htmltools(v.0.5.2), memoise(v.2.0.1), lavaan(v.0.6-11), locfit(v.1.5-9.5), IRanges(v.2.28.0), viridisLite(v.0.4.0), digest(v.0.6.29), assertthat(v.0.2.1), timereg(v.2.0.2), lwgeom(v.0.2-8), Rttf2pt1(v.1.3.10), units(v.0.8-0), RSQLite(v.2.2.13), future.apply(v.1.9.0), rockchalk(v.1.8.151), Exact(v.3.1), data.table(v.1.14.2), blob(v.1.2.3), R.oo(v.1.24.0), S4Vectors(v.0.32.4), lhs(v.1.1.5), splines(v.4.1.3), Formula(v.1.2-4), labeling(v.0.4.2), RCurl(v.1.98-1.6), pec(v.2022.03.06), hms(v.1.1.1), modelr(v.0.1.8), colorspace(v.2.0-3), base64enc(v.0.1-3), mnormt(v.2.0.2), survcomp(v.1.44.1), shape(v.1.4.6), tmvnsim(v.1.0-2), libcoin(v.1.0-9), nnet(v.7.3-17), sass(v.0.4.1), Rcpp(v.1.0.8), coin(v.1.4-2), GPfit(v.1.0-8), fansi(v.1.0.3), tzdb(v.0.3.0), parallelly(v.1.31.1), R6(v.2.5.1), lifecycle(v.1.0.1), rootSolve(v.1.8.2.3), zip(v.2.2.0), semTools(v.0.5-6), ggsignif(v.0.6.3), minqa(v.1.2.4), mi(v.1.0), jquerylib(v.0.1.4), qgraph(v.1.9.2), glasso(v.1.11), TH.data(v.1.1-1), RColorBrewer(v.1.1-3), iterators(v.1.0.14), gower(v.1.0.0), htmlwidgets(v.1.5.4), polyclip(v.1.10-0), GPArotation(v.2022.4-1), terra(v.1.5-21), rvest(v.1.0.2), globals(v.0.14.0), lmom(v.2.8), htmlTable(v.2.4.0), codetools(v.0.2-18), 
matrixStats(v.0.62.0), lubridate(v.1.8.0), randomForest(v.4.7-1), gtools(v.3.9.2), prettyunits(v.1.1.1), psych(v.2.2.3), dbplyr(v.2.1.1), R.methodsS3(v.1.8.1), GenomeInfoDb(v.1.30.1), gtable(v.0.3.0), DBI(v.1.1.2), visdat(v.0.5.3), smoothr(v.0.2.2), httr(v.1.4.3), highr(v.0.9), KernSmooth(v.2.23-20), vroom(v.1.5.7), stringi(v.1.7.6), progress(v.1.2.2), reshape2(v.1.4.4), farver(v.2.1.0), annotate(v.1.72.0), fdrtool(v.1.2.17), magick(v.2.7.3), timeDate(v.3043.102), lisrelToR(v.0.1.4), xml2(v.1.3.3), boot(v.1.3-28), kableExtra(v.1.3.4), rmeta(v.3.0), lme4(v.1.1-28), sem(v.3.1-14), kutils(v.1.70), bit(v.4.0.4), jpeg(v.0.1-9), pkgconfig(v.2.0.3), rstatix(v.0.7.0), bootstrap(v.2019.6) and knitr(v.1.39)