##          used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 508773 27.2    1131714 60.5   643845 34.4
## Vcells 939865  7.2    8388608 64.0  1649066 12.6

0.1 Note

Here we the ABCD release 3.0 data-set

0.2 Loading libraries

The following libraries and default settings were used during the analysis:

options(scipen = 999)

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.6     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(broom)
library(eNetXplorer)
library(glmnet)

## Loading required package: Matrix

## 
## Attaching package: 'Matrix'

## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack

## Loaded glmnet 4.1-4

library(olsrr)

## 
## Attaching package: 'olsrr'

## The following object is masked from 'package:datasets':
## 
##     rivers

library(pander)
library(tidymodels)

## -- Attaching packages -------------------------------------- tidymodels 0.2.0 --

## v dials        0.1.1     v rsample      0.1.1
## v infer        1.0.0     v tune         0.2.0
## v modeldata    0.1.1     v workflows    0.2.6
## v parsnip      0.2.1     v workflowsets 0.2.1
## v recipes      0.2.0     v yardstick    0.0.9

## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x scales::discard() masks purrr::discard()
## x Matrix::expand()  masks tidyr::expand()
## x dplyr::filter()   masks stats::filter()
## x recipes::fixed()  masks stringr::fixed()
## x dplyr::lag()      masks stats::lag()
## x Matrix::pack()    masks tidyr::pack()
## x yardstick::spec() masks readr::spec()
## x recipes::step()   masks stats::step()
## x Matrix::unpack()  masks tidyr::unpack()
## x recipes::update() masks Matrix::update(), stats::update()
## * Search for functions across packages at https://www.tidymodels.org/find/

library("cowplot")
library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:eNetXplorer':
## 
##     export

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

##parallel map
library("ggplot2")
library("furrr")

## Loading required package: future

library("party", quietly = TRUE)

## 
## Attaching package: 'modeltools'

## The following object is masked from 'package:tune':
## 
##     parameters

## The following object is masked from 'package:parsnip':
## 
##     fit

## The following object is masked from 'package:infer':
## 
##     fit

## The following object is masked from 'package:dials':
## 
##     parameters

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## 
## Attaching package: 'strucchange'

## The following object is masked from 'package:stringr':
## 
##     boundary

library("permimp")

theme_set(theme_bw() + theme(panel.grid = element_blank()))
## parallel processing number of cores register
all_cores <- parallel::detectCores(logical = FALSE) - 2

1 Data Preparation

1.1 Load data files

We first loaded all of the relevant data files (not shown here as they refer to local directories):

1.2 load all of the data after quality control

The aim is to compute the following:
structural mri: cortical thickness and subcortical value
resting state mri: Gordon network correlations
stop signal task (SST): any stop vs correct go contrast
MID (monetary incentive): reward vs neutral contrast
Nback task mri scan: 2 back vs 0 back contrast
DTI scan: FA from Hagler atlas

###loading site and scanner information
MRIinfo <-tibble::as_tibble(read.csv(paste0(datafolder, "ABCD_MRI01_DATA_TABLE.csv"))) 
Siteinfo <-tibble::as_tibble(read.csv(paste0(datafolder, "ABCD_LT01_DATA_TABLE.csv")))
MriandSite <- left_join(MRIinfo,Siteinfo, by=c('SUBJECTKEY','EVENTNAME'))
MRIinfo  %>% count(EVENTNAME,SEX)

## # A tibble: 4 x 3
##   EVENTNAME                SEX       n
##   <chr>                    <chr> <int>
## 1 2_year_follow_up_y_arm_1 F      2617
## 2 2_year_follow_up_y_arm_1 M      3076
## 3 baseline_year_1_arm_1    F      5631
## 4 baseline_year_1_arm_1    M      6161

###loading response variables
NIH_TB <-tibble::as_tibble(read.csv(paste0(datafolder,"ABCD_TBSS01_DATA_TABLE.csv"))) 
LittleMan <-tibble::as_tibble(read.csv(paste0(datafolder,"LMTP201_DATA_TABLE.csv"))) 
Pearson <-tibble::as_tibble(read.csv(paste0(datafolder,"ABCD_PS01_DATA_TABLE.csv"))) 
NBackBeh <- tibble::as_tibble(read.csv(paste0(datafolder, "ABCD_MRINBACK02_DATA_TABLE.csv")) ) 

###loading brain scan data
smri_data <- tibble::as_tibble(read.csv( paste0(datafold,"sMRIDestAsegReadableGgsegQCedWeighted.csv")))
Nback_data <- tibble::as_tibble(read.csv(paste0(datafold, "NBackDestAsegReadableGgseg3dQCed.csv")))
MID_data <- tibble::as_tibble(read.csv(paste0(datafold, "MIDDestAsegReadableGgseg3dQCed.csv")))
SST_data <- tibble::as_tibble(read.csv(paste0(datafold, "SSTDestAsegReadableGgseg3d_QCed.csv"))) 
rsmri_data <- tibble::as_tibble(read.csv(paste0(datafold, "RSBetNetExcludedNoCOMBATABCD3.csv")))
DTI_data <- tibble::as_tibble(read.csv(paste0(datafold, "DTA_FA23Tracks.csv")))
### Note that we have not removed NA before Enet tuning to keep the most number of participants

short_names <- tibble::as_tibble(read.csv(paste0(scriptfold,"ShortNames_all.csv") ))

### load vision scan data, according to the documents some of the participants cannot see the IPad screen
vision_idx <- tibble::as_tibble(read.table(paste0(NDAfold, "abcd_svs01.txt"),header = TRUE))

1.3 missing value for DTI

site 01, 17, 19 have no DTI

MriandSite %>% select(SUBJECTKEY,EVENTNAME, SITE_ID_L) %>%
  full_join(DTI_data, by = c('SUBJECTKEY','EVENTNAME')) %>%
  select(FA_R_fornix, SITE_ID_L) %>%
  group_by(SITE_ID_L) %>%
  naniar::miss_var_summary() %>%
  arrange(desc(pct_miss))

## # A tibble: 23 x 4
## # Groups:   SITE_ID_L [23]
##    SITE_ID_L variable    n_miss pct_miss
##    <chr>     <chr>        <int>    <dbl>
##  1 site17    FA_R_fornix    890    100  
##  2 site19    FA_R_fornix    840    100  
##  3 site01    FA_R_fornix    562     99.8
##  4 site08    FA_R_fornix    113     22.6
##  5 site13    FA_R_fornix    233     21.7
##  6 site22    FA_R_fornix      7     20.6
##  7 site10    FA_R_fornix    203     17.7
##  8 site04    FA_R_fornix    209     17.4
##  9 site18    FA_R_fornix     85     15.3
## 10 site15    FA_R_fornix    100     15.2
## # ... with 13 more rows

1.4 calculate partial correlation for resting state

We 1) kept the within-network z-transformed zero correlations 2) took the means of between-network z-transformed zero correlations 3) converted these Fisher z to r and rearranged them into a correlation matrix 4) transformed this matrix to “partial” correlations with cor2pcor 5) converted them back to Fisher z

library("corpcor")
ROI_names <- c("Auditory","CinguloOpercular","CinguloParietal","Default","DorsalAttention",
               "FrontoParietal","None","RetrosplenialTemporal","SensorimotorHand","SensorimotorMouth",
               "Salience","VentralAttention")
stepwise_sum <- function(i){
  return(sum(diff_seq[1:i]))
}
diff_seq = seq(from=11, to=1, by=-1)
diff_seq2=seq(from=11,to=2)+1
dim_vector <- dim(select(rsmri_data, starts_with("avg")))[2]
dim_matrix <- (-(-1)+sqrt(1-4*1*2*(-dim_vector)))/2

step_seq <- map_dbl(.x=seq(from=1, to=11,by =1),.f=stepwise_sum)
upp_seq<- append(12,step_seq+12)
low_seq <- append(1,map_dbl(.x=seq(from=1,to=10, by=1),.f=function(.x){
  return(sum(diff_seq2[1:.x])+1)
}))%>%append(.,78)

partialcor <- function(data_split){
  cor_RS <- select(data_split, starts_with("avg"))
  cor_RS<-DescTools::FisherZInv(cor_RS[,order(names(cor_RS))])
  avg_cor_array <- array(rep(NA,dim_matrix^2*dim(cor_RS)[1]),
                         dim = c(dim_matrix,dim_matrix,dim(cor_RS)[1]))
  Par_cor_array <- array(rep(NA,dim_matrix^2*dim(cor_RS)[1]),
                         dim = c(dim_matrix,dim_matrix,dim(cor_RS)[1]))
  
  row_to_matrix <- function(j){
    target=matrix(rep(1,dim_matrix^2),nrow=dim_matrix)
    for(i in 1:length(low_seq)){
      target[(i+1):13,i]=target[i,(i+1):13]=as.numeric(cor_RS[j,low_seq[i]:upp_seq[i]])
    }
    target[12,13]=target[13,12]=as.numeric(cor_RS[j,upp_seq[i]])
    return(target)
  }
  
  avg_cor_array <-map(.x=seq(from=1,to=dim(cor_RS)[1],by=1),.f=row_to_matrix)
  Par_cor_array <- avg_cor_array %>% map(.,cor2pcor)
  
  matrix_to_row <- function(j){
    target_vec<-rep(NA,dim(cor_RS)[2])
    for(i in 1:length(upp_seq)){
      target_vec[low_seq[i]:upp_seq[i]]<-as.vector(Par_cor_array[[j]][i,(i+1):13])
    }
    return(DescTools::FisherZ(target_vec)) 
  }
  
  Par_cor_RS_list <- map(.x=seq(from=1,to=dim(cor_RS)[1],by=1),
                         .f=matrix_to_row)
  Par_cor_RS_df <- as.data.frame(do.call(rbind,Par_cor_RS_list))
  colnames(Par_cor_RS_df)<-names(cor_RS)
  
  Par_cor_RS_df <- Par_cor_RS_df%>%
    rename_at(.vars = vars(contains("avg")),
              .funs = funs(sub("avg", "par", .)))%>%
    mutate(SUBJECTKEY =data_split$SUBJECTKEY,
           EVENTNAME=data_split$EVENTNAME)
  data_return <- left_join(Par_cor_RS_df,
                           data_split, 
                           by = c("SUBJECTKEY","EVENTNAME"))%>% 
    drop_na()
  return(data_return)
}
rsmri_par_data <- partialcor(data_split = rsmri_data)

## Registered S3 method overwritten by 'DescTools':
##   method         from  
##   reorder.factor gplots

1.5 Join all the data sets

vision_idx <- vision_idx[-1,]%>% mutate(SUBJECTKEY=subjectkey)%>%
                                mutate(EVENTNAME=eventname)

data_all <- plyr::join_all(list(smri_data,
                                MID_data,
                                rsmri_par_data,
                                Nback_data,
                                SST_data,DTI_data,
                                NBackBeh,
                                Pearson,
                                LittleMan,
                                NIH_TB,
                                MriandSite,
                                vision_idx), 
                           by=c('SUBJECTKEY','EVENTNAME'), type='full')
### full join all the data sets to keep all the participants after QC. Actually smri have the most number of participants
### there are only two participants that are not in the smri data set but in other modalities.
### change integer response variables into double for later scaling and IQR
data_all$PEA_RAVLT_LD_TRIAL_VII_TC <- as.double(data_all$PEA_RAVLT_LD_TRIAL_VII_TC)
data_all$PEA_WISCV_TRS <- as.double(data_all$PEA_WISCV_TRS)

1.5.1 filter out the participants that have problems with vision

54 subjects

data_removed <- data_all %>% filter(snellen_va_y == 0 | snellen_va_y == 1 | vis_flg == 2)

removed_subj <- data_removed$SUBJECTKEY
data_all <- data_all %>% filter(!SUBJECTKEY %in% removed_subj)

length(removed_subj)

## [1] 54

1.6 set up variable names

mri_vec <- c("Nback","SST","MID","rsmri","smri","DTI")

Resp_Var <- c('TFMRI_NB_ALL_BEH_C2B_RATE',
              "NIHTBX_PICVOCAB_UNCORRECTED", 
              "NIHTBX_FLANKER_UNCORRECTED",
              "NIHTBX_PATTERN_UNCORRECTED",
              "NIHTBX_PICTURE_UNCORRECTED",
              "NIHTBX_READING_UNCORRECTED",
              "LMT_SCR_PERC_CORRECT",
              "PEA_RAVLT_LD_TRIAL_VII_TC",
             "TFMRI_SST_ALL_BEH_TOTAL_ISSRT" )

resp_var_plotting_long <- c("2-back working memory",
  "Picture vocabulary",
  "Flanker",
  "Pattern comparison processing speed",
  "Picture sequence memory",
  "Oral reading recognition",
  "Little man task correct percentage",
  "RAVLT long delay trial VII total correct",
  "Stop Signal Reaction Time (integration)"
)

resp_var_plotting_short <- c("2-back Work Mem",
                             "Pic Vocab","Flanker",
                             "Pattern Speed",
                             "Seq Memory",
                             "Reading Recog",
                             "Little Man",
                             "Audi Verbal",
                             "integration SSRT")

resp_var_plotting <- tibble("response" =Resp_Var, 
                            "longer_name"=resp_var_plotting_long,
                            "short_name"=resp_var_plotting_short)

subj_info <- c('SUBJECTKEY', 'MRI_INFO_DEVICESERIALNUMBER', 'SITE_ID_L')

resp_names <- data_all %>% select(all_of(Resp_Var))%>%
  names()%>%
  set_names()

new_shorter_names <- short_names 
new_shorter_names$roiShort[97] <- "R Subcentral"
new_shorter_names$roiShort <-  str_squish(string = new_shorter_names$roiShort)

1.7 functions for data preparation

IQR and combat (to adjust batch effects for sMRI, rsfMRI and DTI)

# IQR function to remove data with 3 IQR rule
IQR_remove <- function(data_split){
  data_split%>%
  mutate_at(vars(- all_of(subj_info)), ~ ifelse(
    .x > quantile(.x, na.rm = TRUE)[4] + 3 * IQR(.x, na.rm = TRUE) |
    .x < quantile(.x, na.rm = TRUE)[2] - 3 * IQR(.x, na.rm = TRUE), 
    NA, .x)) %>%
  drop_na() %>%
  mutate_if(is.double, ~ (.x - mean(.x)) / sd(.x)) 
}

library("sva")

## Loading required package: mgcv

## Loading required package: nlme

## 
## Attaching package: 'nlme'

## The following object is masked from 'package:dplyr':
## 
##     collapse

## This is mgcv 1.8-39. For overview type 'help("mgcv-package")'.

## Loading required package: genefilter

## 
## Attaching package: 'genefilter'

## The following objects are masked from 'package:yardstick':
## 
##     sens, spec

## The following object is masked from 'package:readr':
## 
##     spec

## Loading required package: BiocParallel

batch_adjust <- function(data_fold,resp, x,y){
  ols_data <- select(data_fold, resp,starts_with(x), starts_with(y))
  ols_matrix <- as.matrix(t(ols_data))
  dimnames(ols_matrix)<- NULL
  data_fold$SITE_ID_L <- as.factor(data_fold$SITE_ID_L)##have to be changed into factor before combat
 data_fold$SITE_ID_L <- droplevels(data_fold$SITE_ID_L)##drop the empty levels of a factor
 ols_matrix_com <- ComBat(dat = ols_matrix,batch = data_fold$SITE_ID_L)
 ols_data_com <- data.frame(t(ols_matrix_com))
 colnames(ols_data_com) <- colnames(ols_data)
 return(ols_data_com)
}

# if only one type of data is adjusted (e.g., DTI only requires FA but sMRI has two corticol and sub, rs has two within and bet networks)

batch_adjustOne <- function(data_fold,resp, x){
  ols_data <- select(data_fold, resp,starts_with(x))
  ols_matrix <- as.matrix(t(ols_data))
  dimnames(ols_matrix)<- NULL
  data_fold$SITE_ID_L <- as.factor(data_fold$SITE_ID_L)##have to be changed into factor before combat
 data_fold$SITE_ID_L <- droplevels(data_fold$SITE_ID_L)##drop the empty levels of a factor
 ols_matrix_com <- ComBat(dat = ols_matrix,batch = data_fold$SITE_ID_L)
 ols_data_com <- data.frame(t(ols_matrix_com))
 colnames(ols_data_com) <- colnames(ols_data)
 return(ols_data_com)
}

1.8 extract desired contrasts

## look at the contrast of MID and SST

contrast_vec_MID <- MID_data %>% 
  select(.,ends_with("_ROI_Left.Cerebellum.Cortex")) %>%
  colnames() %>%
  str_remove_all(.,"_ROI_Left.Cerebellum.Cortex")

contrast_vec_SST <- SST_data %>% 
  select(.,ends_with("_ROI_Left.Cerebellum.Cortex")) %>%
  colnames() %>%
  str_remove_all(.,"_ROI_Left.Cerebellum.Cortex")

## extract the contrasts for analysis
data_analysis <- data_all %>% 
  select(starts_with("X2backvs0back_ROI_"),
         starts_with("par"),
         starts_with("Within"),
         all_of(Resp_Var),
         all_of(subj_info),
         starts_with("antiRewVsNeu" ),
         starts_with("anystopvscorrectgo" ),
         starts_with("ASEG"),
         starts_with("Dest_Thick"))

1.9 look at the distribution of the tasks

note 2nd time point doesn’t have “NIHTBX_LIST_UNCORRECTED” “NIHTBX_CARDSORT_UNCORRECTED” “PEA_WISCV_TRS”

      plotRes <- Resp_Var %>% map(.,~ggplot(data = data_all, 
                      mapping = aes_string(x = data_all[[.]], color = "EVENTNAME")) +
                      geom_density() + 
                      labs(x= resp_var_plotting$short_name[which(resp_var_plotting$response==.)]) +
                      labs(y= "") +
                      scale_color_discrete(name ="",
                             breaks=c("2_year_follow_up_y_arm_1", "baseline_year_1_arm_1"),
                             labels=c("Follow Up", "Baseline")) +
                      theme_classic()) 
    

ggpubr::ggarrange(plotlist=plotRes, common.legend = TRUE)

## Warning: Removed 3040 rows containing non-finite values (stat_density).
## Removed 3040 rows containing non-finite values (stat_density).

## Warning: Removed 428 rows containing non-finite values (stat_density).

## Warning: Removed 320 rows containing non-finite values (stat_density).

## Warning: Removed 371 rows containing non-finite values (stat_density).

## Warning: Removed 343 rows containing non-finite values (stat_density).

## Warning: Removed 455 rows containing non-finite values (stat_density).

## Warning: Removed 410 rows containing non-finite values (stat_density).

## Warning: Removed 474 rows containing non-finite values (stat_density).

## Warning: Removed 6659 rows containing non-finite values (stat_density).

2 Modeling for genearlisation across times for each cognitive task

2.1 set up baseline and followup participants

First we removed sites that have fewer than 100 participants. But none has fewer than 100.

The data structure of the analysis is comprised of four parts:
1) training set (1st layer training set): predicting cognition from each modality via the elastic net,

validation set (2nd layer training set): combining predicted values from all modalities via the random forest,
test set of the baseline period,
test set of the second year follow up.

To protect data leakage, we select the subjects with the same subject key for the baseline test set and the followup test set.

site_count <- data_analysis %>% count(SITE_ID_L)

baseline_data <- data_all %>% filter(EVENTNAME == "baseline_year_1_arm_1")
followup_data <- data_all %>% filter(EVENTNAME == "2_year_follow_up_y_arm_1")

## all sites have more than 100!
baseline_analysis <- baseline_data %>% filter (SITE_ID_L != site_count$SITE_ID_L[which(site_count$n< 100)])
followup_analysis <- followup_data %>% filter (SITE_ID_L != site_count$SITE_ID_L[which(site_count$n< 100)])

subj_diff <- setdiff(baseline_analysis$SUBJECTKEY,followup_analysis$SUBJECTKEY)
baseline_trainvalid <- baseline_analysis %>% subset(.,SUBJECTKEY %in% subj_diff)
baseline_test <- baseline_analysis %>% subset(.,SUBJECTKEY %in% followup_analysis$SUBJECTKEY)

set.seed(123456)
### divide the training dataset into two
### one for enet tuning and one for random forest tuning
baseline_two_fold <-vfold_cv(data =baseline_trainvalid, v=2,repeats = 1)

# training set (1st layer training set): predicting cognition from each modality via the elastic net,  
dim(analysis(baseline_two_fold$splits[[1]]))[1]

## [1] 3041

# validation set (2nd layer training set): combining predicted values from all modalities via the random forest,  
dim(assessment(baseline_two_fold$splits[[1]]))[1]

## [1] 3042

# test set of the baseline period,   
dim(baseline_test)[1]

## [1] 5622

# test set of the second year follow up
dim(followup_analysis)[1]

## [1] 5656

2.2 Elastic net parameter tuning setup

2.2.1 set up the data and adjust for batch effects

Here we used Combat, an empirical Bayesian method, to adjust for batch effects (separately for the train and test). Because Nback, SST and MID task-based fMRI are contrast data, batch effects are negligible. And accordingly, we decided not to apply combat on the task-based fMRI, and only applied it on the resting state mri, structure mri and DTI. https://www.biorxiv.org/content/10.1101/309260v1.full.pdf

# for models that require no Combat
enet_processing_nocom <- 
  function(resp_var,
           split_train = analysis(baseline_two_fold$splits[[1]]),
           split_valid = assessment(baseline_two_fold$splits[[1]]),
           split_test = baseline_test,followup_test,contrast_name)
    {
              data_train <- split_train %>% 
                select(all_of(resp_var),
                       all_of(subj_info),
                       starts_with(contrast_name)) %>%
                drop_na() %>% 
                IQR_remove()
              
              data_valid <- split_valid %>% 
                select(all_of(resp_var),
                       all_of(subj_info),
                       starts_with(contrast_name)) %>% 
                drop_na() %>% 
                IQR_remove()
              
              baseline_test_set <- split_test %>% 
                select(all_of(resp_var),
                       all_of(subj_info),
                       starts_with(contrast_name)) %>% 
                drop_na() %>% 
                IQR_remove()
              
              followup_test_set <- followup_test %>% 
                select(all_of(resp_var),
                       all_of(subj_info),
                       starts_with(contrast_name)) %>%
                drop_na() %>% 
                IQR_remove()
              
              ols_data_train <- select(data_train, resp_var, starts_with(contrast_name))
              ols_data_valid <- select(data_valid,resp_var,starts_with(contrast_name))
              ols_test_baseline <- select(baseline_test_set,resp_var,starts_with(contrast_name))
              ols_test_followup <- select(followup_test_set,resp_var,starts_with(contrast_name))
    return(list(ols_data_train=ols_data_train,
              ols_data_valid=ols_data_valid,
              ols_test_baseline=ols_test_baseline,
              ols_test_followup=ols_test_followup,
              data_valid_id=tibble(SUBJECTKEY=data_valid$SUBJECTKEY),
              baseline_test_id=tibble(SUBJECTKEY=baseline_test_set$SUBJECTKEY),
              followup_test_id=tibble(SUBJECTKEY=followup_test_set$SUBJECTKEY),
              data_train_id=tibble(SUBJECTKEY=data_train$SUBJECTKEY)))
  }

# for models that require Combat
enet_processing_com <- 
  function(resp_var,
           split_train = analysis( baseline_two_fold$splits[[1]]),
           split_valid = assessment(baseline_two_fold$splits[[1]]),
           split_test=baseline_test,followup_test, measure_one,measure_two)
    {
    data_train <- split_train %>%
      select(all_of(resp_var),
             all_of(subj_info),
             starts_with(measure_one),
             starts_with(measure_two)) %>%
      drop_na() %>% 
      IQR_remove()
    
    data_valid <- split_valid %>%
      select(all_of(resp_var),
             all_of(subj_info),
             starts_with(measure_one),
             starts_with(measure_two)) %>% 
      drop_na()%>% 
      IQR_remove()
    
    baseline_test_set <- split_test%>%
      select(all_of(resp_var),
             all_of(subj_info),
             starts_with(measure_one),
             starts_with(measure_two))%>%
      drop_na()%>% 
      IQR_remove()
    
    followup_test_set <- followup_test%>%
      select(all_of(resp_var),
             all_of(subj_info),
             starts_with(measure_one),
             starts_with(measure_two))%>%
      drop_na()%>% 
      IQR_remove()
    
    ols_data_train <- 
      batch_adjust(data_fold = data_train,
                   resp = resp_var, 
                   x=measure_one, 
                   y=measure_two)
    
    ols_data_valid <- 
      batch_adjust(data_fold = data_valid,
                   resp = resp_var, 
                   x=measure_one, 
                   y=measure_two)
    
    ols_test_baseline <- 
      batch_adjust(data_fold = baseline_test_set,
                   resp = resp_var, 
                   x=measure_one, 
                   y=measure_two)
    
    ols_test_followup <- 
      batch_adjust(data_fold = followup_test_set,
                   resp = resp_var, 
                   x=measure_one, 
                   y=measure_two)
    
  return(list(ols_data_train=ols_data_train,
              ols_data_valid=ols_data_valid,
              ols_test_baseline=ols_test_baseline,
              ols_test_followup=ols_test_followup,
              data_valid_id=tibble(SUBJECTKEY=data_valid$SUBJECTKEY),
              baseline_test_id=tibble(SUBJECTKEY=baseline_test_set$SUBJECTKEY),
              followup_test_id=tibble(SUBJECTKEY=followup_test_set$SUBJECTKEY),
              data_train_id=tibble(SUBJECTKEY=data_train$SUBJECTKEY)))
  }

# for models that require Combat that has one type of data. so far this was used for DTI only. This is bc DTI only one prefix ("fa")
enet_processing_comOne <- 
  function(resp_var,
           split_train = analysis( baseline_two_fold$splits[[1]]),
           split_valid = assessment(baseline_two_fold$splits[[1]]),
           split_test=baseline_test,followup_test, measure_one)
    {
    data_train <- split_train %>%
      select(all_of(resp_var),
             all_of(subj_info),
             starts_with(measure_one)) %>%
      drop_na() %>% 
      IQR_remove()
    
    data_valid <- split_valid %>%
      select(all_of(resp_var),
             all_of(subj_info),
             starts_with(measure_one)) %>% 
      drop_na()%>% 
      IQR_remove()
    
    baseline_test_set <- split_test%>%
      select(all_of(resp_var),
             all_of(subj_info),
             starts_with(measure_one))%>%
      drop_na()%>% 
      IQR_remove()
    
    followup_test_set <- followup_test%>%
      select(all_of(resp_var),
             all_of(subj_info),
             starts_with(measure_one))%>%
      drop_na()%>% 
      IQR_remove()
    
    ols_data_train <- 
      batch_adjustOne(data_fold = data_train,
                   resp = resp_var, 
                   x=measure_one)
    
    ols_data_valid <- 
      batch_adjustOne(data_fold = data_valid,
                   resp = resp_var, 
                   x=measure_one)
    
    ols_test_baseline <- 
      batch_adjustOne(data_fold = baseline_test_set,
                   resp = resp_var, 
                   x=measure_one)
    
    ols_test_followup <- 
      batch_adjustOne(data_fold = followup_test_set,
                   resp = resp_var, 
                   x=measure_one)
    
  return(list(ols_data_train=ols_data_train,
              ols_data_valid=ols_data_valid,
              ols_test_baseline=ols_test_baseline,
              ols_test_followup=ols_test_followup,
              data_valid_id=tibble(SUBJECTKEY=data_valid$SUBJECTKEY),
              baseline_test_id=tibble(SUBJECTKEY=baseline_test_set$SUBJECTKEY),
              followup_test_id=tibble(SUBJECTKEY=followup_test_set$SUBJECTKEY),
              data_train_id=tibble(SUBJECTKEY=data_train$SUBJECTKEY)))
   }

2.2.2 a function for elastic net hyper parameter tuning and performance evaluation

this function will fitting enet models and to apply them the 2nd layer model and to the test sets

enet_tuning <- function(resp_var, data_list){
     #recipe train and test data are from ols as the variables are the same
norm_recipe <-recipe(as.formula(paste0(resp_var, "~ .")), 
                      data =data_list[["ols_data_train"]] ) %>%
              step_dummy(all_nominal()) %>%
              step_center(all_predictors()) %>%
              step_scale(all_predictors())  %>% 
              # estimate the means and standard deviations
              prep(training = data_list[["ols_data_train"]], retain = TRUE)
            ## model
              glmn_mod <- linear_reg(penalty = tune() ,mixture =tune()) %>% 
              set_engine("glmnet")
set.seed(123456)
# 10fold split within the training data
all_rs <- rsample::vfold_cv(data=data_list[["ols_data_train"]],
                            v = 10, repeats = 1)
glmn_wfl <- workflow() %>%
  add_recipe(norm_recipe) %>%
  add_model(glmn_mod)
## penalty is lambda, mixture is alpha
## log10 transformed to get lambda between 10^-10 to 10^1
## this expands the default values (10^-10 to 10^0) from tidymodels https://github.com/tidymodels/dials/issues/140
glmn_set <- dials::parameters(penalty(range = c(-10,1), trans = log10_trans()),mixture())
## 200 levels of lambda and 11 levels of alpha (0, .1 to 1) 
glmn_grid <- grid_regular(glmn_set, levels = c(200,11))
ctrl <- control_grid(save_pred = TRUE, verbose = TRUE)
glmn_tune <- tune_grid(glmn_wfl,
            resamples = all_rs,
            grid = glmn_grid,
            metrics = metric_set(mae),
            control = ctrl)

##select the best tuned lambda and alpha
best_glmn <- select_best(glmn_tune, metric = "mae")

## use the best tunes parameters to fit the test data
glmn_wfl_final <- 
  glmn_wfl %>%
  finalize_workflow(best_glmn) %>%
 parsnip::fit(data = data_list[["ols_data_train"]]) ##clearly another package is blocking this function

### this is the prediction for random forest tuning
pred_glmn <- predict(glmn_wfl_final, new_data =data_list[["ols_data_valid"]])

###this is the prediction for test set
pred_test_baseline <- predict(glmn_wfl_final, new_data =data_list[["ols_test_baseline"]])
pred_test_followup <- predict(glmn_wfl_final, new_data =data_list[["ols_test_followup"]])

##compute the correlation between predicted results and the test data 
performance_glmn <- cor(pred_glmn$.pred,data_list[["ols_data_valid"]][[resp_var]])
performance_baseline <- cor(pred_test_baseline$.pred,data_list[["ols_test_baseline"]][[resp_var]])
performance_followup <- cor(pred_test_followup$.pred,data_list[["ols_test_followup"]][[resp_var]])
best_glmn <- best_glmn %>% 
  mutate(performance=performance_glmn,
         baseline_performance = performance_baseline,
         followup_performance=performance_followup)

return(list("param_tune"=best_glmn, 
            "predicted"=tibble(SUBJECTKEY=data_list[["data_valid_id"]]$SUBJECTKEY, 
                               "pred"=pred_glmn$.pred),
            "predicted_baseline"=tibble(SUBJECTKEY=data_list[["baseline_test_id"]]$SUBJECTKEY, 
                                        "pred"=pred_test_baseline$.pred), 
            "predicted_followup"=tibble(SUBJECTKEY=data_list[["followup_test_id"]]$SUBJECTKEY, 
                                        "pred"=pred_test_followup$.pred),
            "obs_train"=length(data_list[["data_train_id"]]$SUBJECTKEY),
            "obs_valid"=length(data_list[["data_valid_id"]]$SUBJECTKEY)))
}

2.3 model fitting for each cognitive task

2.3.1 actual data preping for elastic net for each cognitive task

Elastic net tuning is saved together with random forest results.

Nback_enet_data <- resp_names %>% 
  map(.,~enet_processing_nocom(resp_var = .,contrast_name = "X2backvs0back_ROI_",followup_test = followup_analysis))

##change this code before redo the random forest
##the codes are changed but not run except for the first one
SST_enet_data <- resp_names %>% 
  map(.,~enet_processing_nocom(resp_var = .,
                             followup_test = followup_analysis,
                             contrast_name = "anystopvscorrectgo_ROI_"))

MID_enet_data <- resp_names %>% 
  map(.,~enet_processing_nocom(resp_var = .,
                               followup_test = followup_analysis,
                               contrast_name = "antiRewVsNeu_ROI_"))

DTI_enet_data <- resp_names %>% 
  map(.,~enet_processing_comOne(resp_var = .,
                                followup_test = followup_analysis,
                                measure_one = "FA_"))

smri_enet_data <- resp_names %>% 
  map(.,~enet_processing_com(resp_var = .,
                             followup_test = followup_analysis,
                             measure_one = "ASEG_",
                             measure_two = "Dest_Thick_"))

rsmri_enet_data <- resp_names %>% 
  map(.,~enet_processing_com(resp_var = .,
                             followup_test = followup_analysis,
                             measure_one = "par",
                             measure_two = "Within"))

enet_data_all <- list("Nback"=Nback_enet_data,
                      "SST"=SST_enet_data,
                      "MID"=MID_enet_data,
                      "rsmri"=rsmri_enet_data,
                      "smri"=smri_enet_data,
                      "DTI"=DTI_enet_data)

2.3.2 actual parameter tuning for elastic net for each cognitive task

##keep from using all the cores
##keep your computer safe
no_cores <- availableCores() - 4
plan(multisession,workers= no_cores)

Nback_results <- resp_names %>% 
  future_map(.,~enet_tuning(resp_var =.,
                            data_list = Nback_enet_data[[.]]),
             .options = furrr_options(seed=TRUE))

##change this code before redo the random forest
##the codes are changed but not run except for the first one
SST_results <- resp_names %>% 
  future_map(.,~enet_tuning(resp_var =.,
                            data_list = SST_enet_data[[.]]),
             .options = furrr_options(seed=TRUE))

MID_results <- resp_names %>% 
  future_map(.,~enet_tuning(resp_var =.,
                            data_list = MID_enet_data[[.]]),
             .options = furrr_options(seed=TRUE))

DTI_results <- resp_names %>% 
  future_map(.,~enet_tuning(resp_var =.
                            ,data_list = DTI_enet_data[[.]]),
             .options = furrr_options(seed=TRUE))

smri_results <- resp_names %>% 
  future_map(.,~enet_tuning(resp_var =.
                            ,data_list = smri_enet_data[[.]]),
             .options = furrr_options(seed=TRUE))

rsmri_results <- resp_names %>% 
  future_map(.,~enet_tuning(resp_var =.
                            ,data_list = rsmri_enet_data[[.]]),
             .options = furrr_options(seed=TRUE))

enet_results_all <- list("Nback_results"=Nback_results,
                         "SST_results"=SST_results,
                         "MID_results"=MID_results,
                         "rsmri_results"=rsmri_results,
                         "smri_results"=smri_results,
                         "DTI_results"=DTI_results)

2.3.3 getting all the numbers of participants in all the data splits

modality_vec <- c("Nback","SST","MID","rsmri","smri","DTI")
modality_results <- paste0(modality_vec,"_results")

datasplits_number = function(data_list, resp_vec, resp_plotting){
  number_datasplits <- modality_results %>% map(.,function(modality_resp=.){
  data_input <- data_list[[modality_resp]]
  one_modality <- resp_vec %>% map(.,function(resp=.){
  one_allset <- tibble(enet_train = data_input[[resp]][["obs_train"]],
                       stack_train=data_input[[resp]][["obs_valid"]],
                       baseline=dim(data_input[[resp]][["predicted_baseline"]])[1],
                       followup=dim(data_input[[resp]][["predicted_followup"]])[1],
                       response=resp) 
  return(one_allset)
  })
})

names(number_datasplits)<- modality_results

number_datasplits <- number_datasplits %>% map(.,~do.call(rbind,.))

number_datasplits <- number_datasplits %>% map(.,~left_join(.,resp_plotting,by="response"))

modality_results %>% map(.,~ mutate(number_datasplits[[.]],
                                    modality=str_remove_all(.,"_results"))%>%
                           select(modality,
                                  enet_train,
                                  stack_train,
                                  baseline,
                                  followup,
                                  short_name)%>% 
   kableExtra::kbl(caption = paste0("Sample size of all the data sets ")) %>%
   kableExtra::kable_classic(full_width = F, html_font = "Cambria") # seems to have problem knitting
   )
}

datasplits_number(data_list = enet_results_all,resp_vec = resp_names,resp_plotting = resp_var_plotting)

[[1]]

Sample size of all the data sets
modality	enet_train	stack_train	baseline	followup	short_name
Nback	1417	1385	2734	3183	2-back Work Mem
Nback	1397	1364	2719	3078	Pic Vocab
Nback	1398	1363	2709	3131	Flanker
Nback	1397	1361	2708	3118	Pattern Speed
Nback	1397	1364	2717	3129	Seq Memory
Nback	1392	1364	2712	3073	Reading Recog
Nback	1395	1361	2642	3175	Little Man
Nback	1392	1351	2682	3123	Audi Verbal
Nback	1162	1148	2285	2738	integration SSRT

[[2]]

Sample size of all the data sets
modality	enet_train	stack_train	baseline	followup	short_name
SST	1403	1428	2755	3001	2-back Work Mem
SST	1447	1458	2854	2987	Pic Vocab
SST	1447	1452	2848	3039	Flanker
SST	1445	1452	2848	3028	Pattern Speed
SST	1447	1456	2850	3040	Seq Memory
SST	1439	1458	2846	2983	Reading Recog
SST	1440	1452	2772	3085	Little Man
SST	1444	1446	2819	3022	Audi Verbal
SST	1469	1480	2872	3088	integration SSRT

[[3]]

Sample size of all the data sets
modality	enet_train	stack_train	baseline	followup	short_name
MID	1599	1594	3066	3221	2-back Work Mem
MID	1647	1635	3213	3222	Pic Vocab
MID	1646	1628	3210	3281	Flanker
MID	1647	1630	3206	3270	Pattern Speed
MID	1651	1635	3211	3279	Seq Memory
MID	1643	1634	3205	3215	Reading Recog
MID	1641	1631	3132	3326	Little Man
MID	1639	1621	3175	3261	Audi Verbal
MID	1295	1323	2579	2771	integration SSRT

[[4]]

Sample size of all the data sets
modality	enet_train	stack_train	baseline	followup	short_name
rsmri	1871	1900	3556	4051	2-back Work Mem
rsmri	2041	2056	3862	4270	Pic Vocab
rsmri	2037	2045	3858	4350	Flanker
rsmri	2039	2052	3853	4331	Pattern Speed
rsmri	2043	2054	3860	4347	Seq Memory
rsmri	2041	2054	3846	4258	Reading Recog
rsmri	2034	2054	3756	4417	Little Man
rsmri	2034	2041	3823	4327	Audi Verbal
rsmri	1503	1535	2970	3438	integration SSRT

[[5]]

Sample size of all the data sets
modality	enet_train	stack_train	baseline	followup	short_name
smri	2133	2178	4028	4380	2-back Work Mem
smri	2467	2484	4622	4724	Pic Vocab
smri	2465	2484	4619	4800	Flanker
smri	2464	2478	4611	4785	Pattern Speed
smri	2467	2485	4619	4805	Seq Memory
smri	2455	2482	4608	4712	Reading Recog
smri	2464	2488	4498	4881	Little Man
smri	2450	2472	4571	4775	Audi Verbal
smri	1661	1706	3292	3624	integration SSRT

[[6]]

Sample size of all the data sets
modality	enet_train	stack_train	baseline	followup	short_name
DTI	2104	2111	3742	4117	2-back Work Mem
DTI	2324	2312	4159	4369	Pic Vocab
DTI	2321	2313	4154	4439	Flanker
DTI	2322	2308	4151	4428	Pattern Speed
DTI	2326	2312	4155	4439	Seq Memory
DTI	2324	2312	4143	4357	Reading Recog
DTI	2317	2311	4054	4510	Little Man
DTI	2320	2301	4126	4412	Audi Verbal
DTI	1637	1650	3051	3424	integration SSRT

2.3.4 Fit the tuned enet model to enetXplorer for each cognitive task and plot them

enetXplorer allows us to use permutation to test the significance of each feature

### NOT RUN by default, takes long time

enet_results_xplorer <- enet_results_all
names(enet_results_xplorer) <- modality_vec

fitting_enetxplorer <- function(resp_vec , modality_var,data_input,tuning_input){
  data_split=data_input[[modality_var]]
enet_train <- resp_vec %>% map(., ~select(data_split[[.]][["ols_data_train"]],-all_of(.)))  
enet_test <- resp_vec %>% map(., ~select(data_split[[.]][["ols_test_baseline"]],-all_of(.))) 
enet_resp_train <- resp_vec %>% map(., ~select(data_split[[.]][["ols_data_train"]],all_of(.))) 
enet_resp_test <- resp_vec %>% map(., ~select(data_split[[.]][["ols_test_baseline"]],all_of(.)))

doParallel::registerDoParallel(cores = 20)

fit_enet_all_IQR <-resp_vec %>% 
  future_map(.,~eNetXplorer(x = as.matrix(enet_train[[.]]) , 
                            y = as.vector(enet_resp_train[[.]][[.]]), 
                            alpha =tuning_input[[modality_var]][[.]][["param_tune"]][["mixture"]], 
                            n_fold = 10,
                            #nlambda.ext = 10, 
                            #nlambda = 10, #uses the default 100 levels with the data-driven range defined by glmnet  
                            scaled = TRUE, 
                            seed = 123456),
             .options = furrr_options(seed=TRUE)) 
return(fit_enet_all_IQR)
}

##keep from using all the cores
##keep your computer safe
no_cores <- availableCores() - 4
plan(multisession,workers= no_cores)

enet_tuning_fit <- modality_vec  %>% 
  map(.,~fitting_enetxplorer(resp_vec = resp_names,
                             modality_var = .,
                             data_input = enet_data_all,
                             tuning_input = enet_results_xplorer))

names(enet_tuning_fit)<- modality_vec

2.3.5 results of enetXplorer for each cognitive task

here we show hyperparameters from enetXplorer.
Alpha is forced to be the same with the tidymodels approach. Lambda via glmnet adaptive range is close to the tidymodels approach. see more on this approach here
https://stats.stackexchange.com/questions/174897/choosing-the-range-and-grid-density-for-regularization-parameter-in-lasso The accuracy is out-of-bag correlations between predicted and observed within the training set. P value is based on the permutation test.

enet_fit_table <- function(modality_input,fit_input, tune_input,resp_input, plotting_names ){
  fit_enet_all_IQR <- fit_input[[modality_input]]
  glmn_tuning_all_IQR <- tune_input[[modality_input]]
  lambdas_all <- vector("list", length = length(resp_input))
  names(lambdas_all)<- resp_input
  lambdas_all_best <- vector("list", length = length(resp_input))
  names(lambdas_all_best)<- resp_input
  summary_enet_all <- vector("list", length = length(resp_input))
  names(summary_enet_all) <- resp_input

for(i in 1:length(resp_input)){
  lambdas_all[[resp_input[i]]] <- fit_enet_all_IQR[[resp_input[i]]][["lambda_values"]]
  lambdas_all_best[[resp_input[i]]] <- fit_enet_all_IQR[[resp_input[i]]][["best_lambda"]]
  summary_enet_all[[resp_input[i]]]<- as_tibble(summary(fit_enet_all_IQR[[resp_input[i]]])[[2]]) %>% slice(1)
}

summary_enet_all_test <- summary_enet_all %>% bind_rows() %>% 
  mutate(respones = plotting_names$short_name)

summary_vec <- colnames(summary_enet_all_test)[-5]

summary_enet_all_test <- summary_vec %>% map(.,function(var_input=.){
  summary_enet_all_test[[paste0(var_input)]] <- round(summary_enet_all_test[[paste0(var_input)]],4)
})
names(summary_enet_all_test)<- summary_vec

summary_enet_all_test <- summary_enet_all_test %>% bind_rows()%>% 
  mutate(respones = plotting_names$short_name)%>%
  rename(.,Alpha =  alpha, `Best-tune lambda` = lambda.max, 
         `Accuracy (Pearson r)` = QF.est, 
         `P-value` = model.vs.null.pval) %>%
  #knitr::kable(caption = paste0("EnetXplorer fitting of ", modality_input))
   kableExtra::kbl(caption = paste0("EnetXplorer fitting of ", modality_input)) %>%
   kableExtra::kable_classic(full_width = F, html_font = "Cambria") 
# seems to have problem knitting
##problem solved by adding results='asis' option in knitting
}

 modality_vec %>% map(.,~enet_fit_table(modality_input=.,
                                        fit_input = enet_tuning_fit, 
                                        tune_input = enet_results_xplorer,
                                        resp_input = resp_names,
                                        plotting_names = resp_var_plotting))

[[1]]

EnetXplorer fitting of Nback
Alpha	Best-tune lambda	Accuracy (Pearson r)	P-value	respones
0.0	0.7398	0.4706	0.0004	2-back Work Mem
0.0	0.8318	0.3464	0.0004	Pic Vocab
0.0	1.5030	0.1769	0.0004	Flanker
0.3	0.1277	0.1183	0.0004	Pattern Speed
0.0	4.0530	0.1895	0.0004	Seq Memory
0.0	2.4728	0.2959	0.0004	Reading Recog
0.1	0.1947	0.2426	0.0004	Little Man
0.1	0.2181	0.1645	0.0004	Audi Verbal
0.0	0.8539	0.1153	0.0012	integration SSRT

[[2]]

EnetXplorer fitting of SST
Alpha	Best-tune lambda	Accuracy (Pearson r)	P-value	respones
0.5	0.0254	0.1282	0.0008	2-back Work Mem
0.2	0.0484	0.1658	0.0004	Pic Vocab
0.5	0.0476	0.1196	0.0012	Flanker
0.0	0.0593	0.0581	0.0496	Pattern Speed
0.1	0.0018	0.0052	0.4390	Seq Memory
0.0	0.0062	0.0741	0.0216	Reading Recog
0.8	0.0314	0.0936	0.0064	Little Man
0.9	0.0518	0.0549	0.0396	Audi Verbal
0.0	0.5711	0.3137	0.0004	integration SSRT

[[3]]

EnetXplorer fitting of MID
Alpha	Best-tune lambda	Accuracy (Pearson r)	P-value	respones
0.0	0.6653	0.1379	0.0004	2-back Work Mem
0.0	0.8611	0.2202	0.0004	Pic Vocab
0.0	2.6097	0.0704	0.0112	Flanker
0.0	2.0368	0.0783	0.0028	Pattern Speed
0.0	2.8319	0.0815	0.0064	Seq Memory
0.1	0.1873	0.2257	0.0004	Reading Recog
0.1	0.1866	0.0916	0.0040	Little Man
0.0	1.7944	0.1472	0.0004	Audi Verbal
1.0	0.0233	0.0760	0.0224	integration SSRT

[[4]]

EnetXplorer fitting of rsmri
Alpha	Best-tune lambda	Accuracy (Pearson r)	P-value	respones
0	0.0512	0.1677	0.0004	2-back Work Mem
0	0.5142	0.2210	0.0004	Pic Vocab
0	3.0027	0.0962	0.0004	Flanker
0	0.3233	0.0822	0.0032	Pattern Speed
0	1.1274	0.0997	0.0004	Seq Memory
0	0.0907	0.1959	0.0004	Reading Recog
1	0.0304	0.1415	0.0004	Little Man
0	3.9336	0.0995	0.0008	Audi Verbal
1	0.0351	0.0645	0.0328	integration SSRT

[[5]]

EnetXplorer fitting of smri
Alpha	Best-tune lambda	Accuracy (Pearson r)	P-value	respones
0.0	4.8930	0.1455	0.0004	2-back Work Mem
0.2	0.0418	0.2112	0.0004	Pic Vocab
1.0	0.0328	0.0754	0.0060	Flanker
0.0	1.8544	0.0612	0.0100	Pattern Speed
0.0	2.7896	0.1288	0.0004	Seq Memory
0.0	0.9688	0.1869	0.0004	Reading Recog
0.0	2.9783	0.1447	0.0004	Little Man
0.1	0.2536	0.0920	0.0004	Audi Verbal
0.0	4.6013	0.0647	0.0144	integration SSRT

[[6]]

EnetXplorer fitting of DTI
Alpha	Best-tune lambda	Accuracy (Pearson r)	P-value	respones
0.0	0.7204	0.1089	0.0004	2-back Work Mem
0.0	0.0129	0.2142	0.0004	Pic Vocab
0.2	0.0295	0.1105	0.0004	Flanker
0.0	0.3508	0.1073	0.0004	Pattern Speed
0.0	0.4257	0.1005	0.0004	Seq Memory
1.0	0.0019	0.1863	0.0004	Reading Recog
0.1	0.0544	0.1337	0.0004	Little Man
0.0	0.0939	0.0991	0.0004	Audi Verbal
0.0	1.5825	-0.0135	0.4242	integration SSRT

2.3.6 function to extract output from enetXplorer

extract_tibble <- function(elastic_mod, alpha_index) {
   variable <- elastic_mod$feature_coef_wmean[, alpha_index] %>% names()
    wmean <- elastic_mod$feature_coef_wmean[, alpha_index]
    wsd <- elastic_mod$feature_coef_wsd[, alpha_index]
    null_wmean <- elastic_mod$null_feature_coef_wmean[, alpha_index]
    null_wsd <- elastic_mod$null_feature_coef_wsd[, alpha_index]
    pvalue <- elastic_mod$feature_coef_model_vs_null_pval[, alpha_index]
    
    tib <- tibble(variable, wmean, wsd, null_wmean, null_wsd, pvalue)
   
    tib <- tib %>%
      gather(key = 'placeholder', value = 'value', wmean, wsd, null_wmean, null_wsd) %>%
      mutate(type = ifelse(str_detect(placeholder, 'null'), 'null', 'target'),
             placeholder = (str_remove(placeholder, 'null_'))) %>%
      mutate(type = factor(type, labels = c('Null', 'Target'))) %>%
      spread(placeholder, value)
    tib
}

2.3.7 Plot results from eNetExplorer with task-fMRI predicting each cognitive task

modality_type_vec <- c("Nback","SST","MID")
str_type_vec <- c("X2backvs0back_ROI_","anystopvscorrectgo_ROI_","antiRewVsNeu_ROI_")

enet_interval_plot <- 
  function(data_input,
           tuning_input,
           modality_input,
           str_input,
           resp_input,
           plotting_names=resp_var_plotting){
    
    fit_enet_all_IQR <- data_input[[modality_input]]
    names(tuning_input)<- modality_vec
    glmn_tuning_all_IQR <- tuning_input[[modality_input]]
    
    coefs_enet_all <- resp_input %>%  
      map(.,
          ~extract_tibble(
            fit_enet_all_IQR[[.]], 
            alpha_index = paste0("a",glmn_tuning_all_IQR[[.]][["param_tune"]][["mixture"]])))
    
    coefs_enet_all <- coefs_enet_all  %>% 
      map(.,~filter(.,pvalue < 0.05) %>%
            mutate(.,type = ifelse(type == 'Null', 
                                   'Null permuted models', 
                                   'Target models'),
                   roi = str_remove(variable, str_input))%>%
            left_join( .,new_shorter_names,by="roi"))
    
    roi_num_enet <- coefs_enet_all %>% map(.,~dim(.)[1])  
    
    max_roi_enet <- max(as.numeric(roi_num_enet))
    coefs_enet_test <- coefs_enet_all[[resp_input[1]]] %>% 
      group_by(type)
    
    coefs_enet_test <- coefs_enet_test%>% nest(-type)
    
    coefs_enet_test[[2]][[1]] <- 
      coefs_enet_test[[2]][[1]] %>% 
      mutate(direction1 = 
               ifelse(coefs_enet_test[[2]][[1]]$wmean >=
                        median(coefs_enet_test[[2]][[1]]$wmean)|roi_num_enet[[resp_input[1]]] <=
                        floor(max_roi_enet/2),"big","small"))
    
    coefs_enet_test[[2]][[2]] <- coefs_enet_test[[2]][[2]] %>% 
      mutate(direction1=coefs_enet_test[[2]][[1]]$direction1)

    coefs_enet_test <- coefs_enet_test %>% unnest()
    
    coefs_enet_all <- coefs_enet_all%>%map(.,~group_by(.,type)) 
    
    coefs_enet_all <- coefs_enet_all%>%map(.,~nest(.,-type)) 

for(i in 1:length(resp_input)){
  coefs_enet_all[[resp_input[i]]][["data"]][[2]]<-
    coefs_enet_all[[resp_input[i]]][["data"]][[2]] %>% 
    mutate(direction = ifelse(coefs_enet_all[[resp_input[i]]][["data"]][[2]]$wmean >= median(coefs_enet_all[[resp_input[i]]][["data"]][[2]]$wmean)|roi_num_enet[[resp_input[i]]] <= floor(max_roi_enet/2),"big","small"))

    coefs_enet_all[[resp_input[i]]][["data"]][[1]] <- 
      coefs_enet_all[[resp_input[i]]][["data"]][[1]] %>%
      mutate(direction=coefs_enet_all[[resp_input[i]]][["data"]][[2]]$direction)
}

coefs_enet_all <- coefs_enet_all %>%map(.,~unnest(.)) 

resp_input %>% map(.,~ggplot(coefs_enet_all[[.]], 
                             aes(x = fct_reorder(roiShort, wmean), 
                                 y = wmean, 
                                 ymax = wmean + 2 * wsd, 
                                 ymin = wmean - 2 * wsd,
                                 col = type)) +
  geom_pointrange(fatten = 0.5, key_glyph = 'point') +
  scale_y_continuous(labels = numform::ff_num(zero = 0, digits = 2)) + 
  scale_color_grey(start = 0.7, end = 0.5) +
  coord_flip() +
  guides(colour = guide_legend(override.aes = list(size = 2.5)))+
  labs(x = 'Explanatory Variables (Brain Regions)', 
       y = 'Averaged Coefficient Across Models (±2 SD)', 
       col = 'Model type',
       title = paste0(modality_input, " task-fMRI Predicting\n",
                      plotting_names$longer_name[[which(plotting_names$response==.)]])) +
  facet_wrap(~ direction, scales = 'free_y') +
  scale_color_manual(values = c("#56B4E9", "black"),labels = c("Permuted Null", "Target")) +     
  theme_bw() +  
#theme(axis.text.y = element_text(angle = 15)) +
theme(legend.title=element_blank()) +  
theme(legend.position = "top") + 
theme(
  axis.title.x = element_text(size = 15),
  axis.text.x = element_text(size = 12),
  axis.title.y = element_text(size = 15),
  axis.text.y = element_text(size = 12),
  legend.text = element_text(size = 15),
  plot.title = element_text(size=15)) +
 theme(
    strip.background = element_blank(),
    strip.text.x = element_blank()
) 
+ theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
)

  } 

map2(.x=modality_type_vec,
     .y=str_type_vec,
     ~enet_interval_plot(data_input=enet_tuning_fit,
                         tuning_input=enet_results_all,
                         resp_input = resp_names,
                         modality_input = .x,
                         str_input = .y,
                         plotting_names=resp_var_plotting))

## [[1]]
## [[1]]$TFMRI_NB_ALL_BEH_C2B_RATE

## 
## [[1]]$NIHTBX_PICVOCAB_UNCORRECTED

## 
## [[1]]$NIHTBX_FLANKER_UNCORRECTED

## 
## [[1]]$NIHTBX_PATTERN_UNCORRECTED

## 
## [[1]]$NIHTBX_PICTURE_UNCORRECTED

## 
## [[1]]$NIHTBX_READING_UNCORRECTED

## 
## [[1]]$LMT_SCR_PERC_CORRECT

## 
## [[1]]$PEA_RAVLT_LD_TRIAL_VII_TC

## 
## [[1]]$TFMRI_SST_ALL_BEH_TOTAL_ISSRT

## 
## 
## [[2]]
## [[2]]$TFMRI_NB_ALL_BEH_C2B_RATE

## 
## [[2]]$NIHTBX_PICVOCAB_UNCORRECTED

## 
## [[2]]$NIHTBX_FLANKER_UNCORRECTED

## 
## [[2]]$NIHTBX_PATTERN_UNCORRECTED

## 
## [[2]]$NIHTBX_PICTURE_UNCORRECTED

## 
## [[2]]$NIHTBX_READING_UNCORRECTED

## 
## [[2]]$LMT_SCR_PERC_CORRECT

## 
## [[2]]$PEA_RAVLT_LD_TRIAL_VII_TC

## 
## [[2]]$TFMRI_SST_ALL_BEH_TOTAL_ISSRT

## 
## 
## [[3]]
## [[3]]$TFMRI_NB_ALL_BEH_C2B_RATE

## 
## [[3]]$NIHTBX_PICVOCAB_UNCORRECTED

## 
## [[3]]$NIHTBX_FLANKER_UNCORRECTED

## 
## [[3]]$NIHTBX_PATTERN_UNCORRECTED

## 
## [[3]]$NIHTBX_PICTURE_UNCORRECTED

## 
## [[3]]$NIHTBX_READING_UNCORRECTED

## 
## [[3]]$LMT_SCR_PERC_CORRECT

## 
## [[3]]$PEA_RAVLT_LD_TRIAL_VII_TC

## 
## [[3]]$TFMRI_SST_ALL_BEH_TOTAL_ISSRT

2.3.8 Plot results from eNetExplorer with sMRI predicting each cognitive task

enet_interval_plot_smri <- function(data_input,tuning_input,resp_input,plotting_names=resp_var_plotting){
    fit_enet_all_IQR <- data_input[["smri"]]
    names(tuning_input)<- modality_vec
    glmn_tuning_all_IQR <- tuning_input[["smri"]]
    
      coefs_enet_all <- resp_input %>%  map(.,~extract_tibble(fit_enet_all_IQR[[.]], alpha_index = paste0("a",glmn_tuning_all_IQR[[.]][["param_tune"]][["mixture"]])))
      roi_vec <- coefs_enet_all[[1]][["variable"]]
      roi_vec <- gsub("ASEG_","",roi_vec)
      roi_vec <- gsub("Dest_Thick_","",roi_vec)
      roi_frame <- tibble(variable=coefs_enet_all[[1]][["variable"]],roi=roi_vec)
    coefs_enet_all <- coefs_enet_all  %>% map(.,~left_join(.,roi_frame,by="variable")%>%filter(.,pvalue < 0.05) %>%
  mutate(.,type = ifelse(type == 'Null', 'Null permuted models', 'Target models'))%>%
    left_join( .,new_shorter_names,by="roi"))
  
roi_num_enet <- coefs_enet_all %>% map(.,~dim(.)[1])  

max_roi_enet <- max(as.numeric(roi_num_enet))
coefs_enet_test <- coefs_enet_all[[resp_input[1]]] %>% 
  group_by(type)
coefs_enet_test <- coefs_enet_test%>% nest(-type)
 coefs_enet_test[[2]][[1]] <- coefs_enet_test[[2]][[1]] %>% 
   mutate(direction1 = ifelse(coefs_enet_test[[2]][[1]]$wmean >= median(coefs_enet_test[[2]][[1]]$wmean)|roi_num_enet[[resp_input[1]]] <= floor(max_roi_enet/2),"big","small"))
coefs_enet_test[[2]][[2]] <- coefs_enet_test[[2]][[2]] %>% mutate(direction1=coefs_enet_test[[2]][[1]]$direction1)
coefs_enet_test <- coefs_enet_test %>% unnest()

coefs_enet_all <- coefs_enet_all%>%map(.,~group_by(.,type)) 
coefs_enet_all <- coefs_enet_all%>%map(.,~nest(.,-type)) 
for(i in 1:length(resp_input)){
  coefs_enet_all[[resp_input[i]]][["data"]][[2]]<-coefs_enet_all[[resp_input[i]]][["data"]][[2]] %>% mutate(direction = ifelse(coefs_enet_all[[resp_input[i]]][["data"]][[2]]$wmean >= median(coefs_enet_all[[resp_input[i]]][["data"]][[2]]$wmean)|roi_num_enet[[resp_input[i]]] <= floor(max_roi_enet/2),"big","small"))
  coefs_enet_all[[resp_input[i]]][["data"]][[1]] <- coefs_enet_all[[resp_input[i]]][["data"]][[1]] %>% mutate(direction=coefs_enet_all[[resp_input[i]]][["data"]][[2]]$direction)
}

coefs_enet_all <- coefs_enet_all %>%map(.,~unnest(.)) 

resp_input %>% map(.,~ggplot(coefs_enet_all[[.]], aes(x = fct_reorder(roiShort, wmean), 
             y = wmean, 
             ymax = wmean + 2 * wsd, ymin = wmean - 2 * wsd,
             col = type)) +
  geom_pointrange(fatten = 0.5, key_glyph = 'point') +
  scale_y_continuous(labels = numform::ff_num(zero = 0, digits = 2)) + 
  scale_color_grey(start = 0.7, end = 0.5) +
  coord_flip() +
  guides(colour = guide_legend(override.aes = list(size = 2.5)))+
  labs(x = 'Explanatory Variables (Brain Regions)', 
       y = 'Averaged Coefficient Across Models (±2 SD)', 
       col = 'Model type',
       title = paste0("sMRI Predicting\n",plotting_names$longer_name[[which(plotting_names$response==.)]])) +
  facet_wrap(~ direction, scales = 'free_y') +
  scale_color_manual(values = c("#56B4E9", "black"),labels = c("Permuted Null", "Target")) +     
  theme_bw() +  
#theme(axis.text.y = element_text(angle = 15)) +
theme(legend.title=element_blank()) +  
theme(legend.position = "top") + 
theme(
  axis.title.x = element_text(size = 15),
  axis.text.x = element_text(size = 12),
  axis.title.y = element_text(size = 15),
  axis.text.y = element_text(size = 12),
  legend.text = element_text(size = 15),
  plot.title = element_text(size=15)) +
 theme(
    strip.background = element_blank(),
    strip.text.x = element_blank()
) 
+ theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
)

  } 



enet_interval_plot_smri(data_input=enet_tuning_fit,tuning_input=enet_results_all,resp_input = resp_names,plotting_names=resp_var_plotting)

## $TFMRI_NB_ALL_BEH_C2B_RATE

## 
## $NIHTBX_PICVOCAB_UNCORRECTED

## 
## $NIHTBX_FLANKER_UNCORRECTED

## 
## $NIHTBX_PATTERN_UNCORRECTED

## 
## $NIHTBX_PICTURE_UNCORRECTED

## 
## $NIHTBX_READING_UNCORRECTED

## 
## $LMT_SCR_PERC_CORRECT

## 
## $PEA_RAVLT_LD_TRIAL_VII_TC

## 
## $TFMRI_SST_ALL_BEH_TOTAL_ISSRT

2.3.9 Plot results from eNetExplorer with rs-fMRI predicting each cognitive task

roi_names <- colnames(select(rsmri_par_data, starts_with("parAuditory")))%>% str_remove_all(.,"parAuditory")
roi_names_all <- c("Auditory",roi_names)
par_roi_names_all <- paste0("par", roi_names_all)


matrix_prepare <- function(data_list){
  heat_matrix <- matrix(rep(0,13*13),nrow = 13)
  for(i in 1:dim(data_list)[1]){
  if(str_detect(data_list$roi[i],paste0("^","par"))==TRUE){
coor_matrix_1 <- which(as.vector(par_roi_names_all %>% map(.,~str_detect(data_list$roi[i],paste0("\\A",.)))%>%do.call(rbind,.)) ==TRUE) 
coor_matrix_2 <- which(as.vector(roi_names_all %>% map(.,~str_detect(data_list$roi[i],paste0(.,"$")))%>%do.call(rbind,.)) ==TRUE) 
heat_matrix[coor_matrix_1,coor_matrix_2]<- heat_matrix[coor_matrix_2,coor_matrix_1]<- data_list$estimate[i]
}
if(str_detect(data_list$roi[i],paste0("^","Within"))==TRUE){
  coor_matrix <- which(as.vector(roi_names_all %>% map(.,~str_detect(data_list$roi[i],paste0("Within",.)))%>%do.call(rbind,.)) ==TRUE) 
  heat_matrix[coor_matrix,coor_matrix]<- data_list$estimate[i]
}
  }
  heat_matrix[lower.tri(heat_matrix)]<- NA
return(heat_matrix)
  }


enet_interval_plot_rsmri <- function(data_input,tuning_input,resp_input,plotting_names=resp_var_plotting){
    fit_ridge_all_RS_par <- data_input[["rsmri"]]
    names(tuning_input)<- modality_vec
    glmn_tuning_all_RS_par <- tuning_input[["rsmri"]]
    
      coefs_ridge_all <- resp_input %>%  map(.,~extract_tibble(fit_ridge_all_RS_par[[.]], alpha_index = paste0("a",glmn_tuning_all_RS_par[[.]][["param_tune"]][["mixture"]])))
    coefs_ridge_all <- coefs_ridge_all  %>% map(.,~filter(.,pvalue < 0.05) %>%
  mutate(.,type = ifelse(type == 'Null', 'Null permuted models', 'Target models'),roi=variable,roi_short= str_remove_all(variable,"^par")))
roi_num_ridge <- coefs_ridge_all %>% map(.,~dim(.)[1])
max_roi_ridge <- max(as.numeric(roi_num_ridge))

coefs_ridge_test <- coefs_ridge_all[[resp_input[1]]] %>% group_by(type)
coefs_ridge_test <- coefs_ridge_test%>% nest(-type)
 coefs_ridge_test[[2]][[1]] <- coefs_ridge_test[[2]][[1]] %>% mutate(direction1 = ifelse(coefs_ridge_test[[2]][[1]]$wmean >= median(coefs_ridge_test[[2]][[1]]$wmean)|roi_num_ridge[[resp_input[1]]] <= floor(max_roi_ridge/2),"big","small"))
coefs_ridge_test[[2]][[2]] <- coefs_ridge_test[[2]][[2]] %>% mutate(direction1=coefs_ridge_test[[2]][[1]]$direction1)
coefs_ridge_test <- coefs_ridge_test %>% unnest()

coefs_ridge_all <- coefs_ridge_all%>%map(.,~group_by(.,type)) 
coefs_ridge_all <- coefs_ridge_all%>%map(.,~nest(.,-type)) 
for(i in 1:length(resp_input)){
  coefs_ridge_all[[resp_input[i]]][["data"]][[2]]<-coefs_ridge_all[[resp_input[i]]][["data"]][[2]] %>% mutate(direction = ifelse(coefs_ridge_all[[resp_input[i]]][["data"]][[2]]$wmean >= median(coefs_ridge_all[[resp_input[i]]][["data"]][[2]]$wmean)|roi_num_ridge[[resp_input[i]]] <= floor(max_roi_ridge/2),"big","small"))
  coefs_ridge_all[[resp_input[i]]][["data"]][[1]] <- coefs_ridge_all[[resp_input[i]]][["data"]][[1]] %>% mutate(direction=coefs_ridge_all[[resp_input[i]]][["data"]][[2]]$direction)
}
coefs_ridge_all <- coefs_ridge_all %>%map(.,~unnest(.))

matrix_ridge<- coefs_ridge_all %>% map(.,~filter(., type=="Target models")%>% mutate(.,estimate=wmean)%>%matrix_prepare(.))

for(i in 1:length(resp_input)){
  colnames(matrix_ridge[[resp_input[i]]]) <- roi_names_all
  rownames(matrix_ridge[[resp_input[i]]])<- roi_names_all
}
matrix_ridge_test <- matrix_ridge%>% map(.,~as.data.frame(.)%>%mutate(roi = roi_names_all)%>%reshape2::melt(.,id.vars="roi")) 

method_vec <- c("Enet")
matrix_list <- list(Enet = matrix_ridge_test)
matrix_list_name <- method_vec %>% map(.,function(method_used=.){
  matrix_list_name <- resp_input%>%map(.,~mutate(matrix_list[[method_used]][[.]], method=method_used))
 # matrix_list_name <- resp_input %>% map(.,~range(matrix_list_name[[method_used]][[.]]$value,na.rm=TRUE)fileDate = '31Dec2020') 
  return(matrix_list_name)
})
 names(matrix_list_name)=method_vec
 legend_all <- method_vec %>% map(.,function(method_used=.){
    legend_all <- resp_input %>% map(.,~range(matrix_list_name[[method_used]][[.]]$value,na.rm=TRUE)) 
  return(legend_all)
 })
 names(legend_all)<- method_vec
legend_all_test <- t(as.matrix (as.data.frame(legend_all))) 
legend_test <- tibble("low"=legend_all_test[,1],"upp"=legend_all_test[,2],"resp"=resp_input, "method"=c(rep("Elastic Net",length(resp_input))))

legend_resp <- vector("list", length = length(resp_input))
names(legend_resp)<- resp_input
for(i in 1:length(resp_input)){
  legend_resp[[resp_input[i]]]<- tibble("upp"= max(legend_test$upp[which(legend_test$resp==resp_input[i])]),
                                               "low"=min(legend_test$low[which(legend_test$resp==resp_input[i])])) 
}

resp_input%>% map(.,~ggplot(data =matrix_ridge_test[[.]], aes(x=variable, y=roi, fill=value)) +
 geom_tile(color = "white")+
   labs(x=NULL,y=NULL,title=paste0("rs-fMRI Predicting\n", plotting_names$longer_name[[which(plotting_names$response==.)]]))+
 scale_fill_gradient2(low = muted("blue") , high =muted("red"), mid = "grey80", 
   midpoint = 0,na.value = "grey50", limit = c(legend_resp[[.]]$low, legend_resp[[.]]$upp), space = "Lab", 
   name="Parameter\nEstimation") +
  theme_minimal()+ 
 theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1),
      axis.text.y = element_text(angle = 45, vjust = 1, size = 12, hjust = 1),
     #axis.text.x = element_blank(),
     #axis.text.y = element_blank(),
    plot.title = element_text(size=17))+
 coord_fixed())

  } 

enet_interval_plot_rsmri(data_input=enet_tuning_fit,tuning_input=enet_results_all,resp_input = resp_names,plotting_names=resp_var_plotting)

## $TFMRI_NB_ALL_BEH_C2B_RATE

## 
## $NIHTBX_PICVOCAB_UNCORRECTED

## 
## $NIHTBX_FLANKER_UNCORRECTED

## 
## $NIHTBX_PATTERN_UNCORRECTED

## 
## $NIHTBX_PICTURE_UNCORRECTED

## 
## $NIHTBX_READING_UNCORRECTED

## 
## $LMT_SCR_PERC_CORRECT

## 
## $PEA_RAVLT_LD_TRIAL_VII_TC

## 
## $TFMRI_SST_ALL_BEH_TOTAL_ISSRT

2.3.10 Plot results from eNetExplorer with DTI predicting each cognitive task

enet_interval_plot_DTI <- function(data_input, tuning_input,resp_input,plotting_names=resp_var_plotting){
    fit_enet_all_IQR <- data_input[["DTI"]]
    names(tuning_input)<- modality_vec
    glmn_tuning_all_IQR <- tuning_input[["DTI"]]
    
      coefs_enet_all <- resp_input %>%  map(.,~extract_tibble(fit_enet_all_IQR[[.]], alpha_index = paste0("a",glmn_tuning_all_IQR[[.]][["param_tune"]][["mixture"]])))
    coefs_enet_all <- coefs_enet_all  %>% map(.,~filter(.,pvalue < 0.05) %>%
  mutate(.,type = ifelse(type == 'Null', 'Null permuted models', 'Target models'),
         roi = str_remove(variable, "FA_")))
roi_num_enet <- coefs_enet_all %>% map(.,~dim(.)[1])  

max_roi_enet <- max(as.numeric(roi_num_enet))
coefs_enet_test <- coefs_enet_all[[resp_input[1]]] %>% group_by(type)
coefs_enet_test <- coefs_enet_test%>% nest(-type)
 coefs_enet_test[[2]][[1]] <- coefs_enet_test[[2]][[1]] %>% mutate(direction1 = ifelse(coefs_enet_test[[2]][[1]]$wmean >= median(coefs_enet_test[[2]][[1]]$wmean)|roi_num_enet[[resp_input[1]]] <= floor(max_roi_enet/2),"big","small"))
coefs_enet_test[[2]][[2]] <- coefs_enet_test[[2]][[2]] %>% mutate(direction1=coefs_enet_test[[2]][[1]]$direction1)
coefs_enet_test <- coefs_enet_test %>% unnest()

coefs_enet_all <- coefs_enet_all%>%map(.,~group_by(.,type)) 
coefs_enet_all <- coefs_enet_all%>%map(.,~nest(.,-type)) 
for(i in 1:length(resp_input)){
  coefs_enet_all[[resp_input[i]]][["data"]][[2]]<-coefs_enet_all[[resp_input[i]]][["data"]][[2]] %>% mutate(direction = ifelse(coefs_enet_all[[resp_input[i]]][["data"]][[2]]$wmean >= median(coefs_enet_all[[resp_input[i]]][["data"]][[2]]$wmean)|roi_num_enet[[resp_input[i]]] <= floor(max_roi_enet/2),"big","small"))
  coefs_enet_all[[resp_input[i]]][["data"]][[1]] <- coefs_enet_all[[resp_input[i]]][["data"]][[1]] %>% mutate(direction=coefs_enet_all[[resp_input[i]]][["data"]][[2]]$direction)
}

coefs_enet_all <- coefs_enet_all %>%map(.,~unnest(.)) 

resp_input %>% map(.,~ggplot(coefs_enet_all[[.]], aes(x = fct_reorder(roi, wmean), 
             y = wmean, 
             ymax = wmean + 2 * wsd, ymin = wmean - 2 * wsd,
             col = type)) +
  geom_pointrange(fatten = 0.5, key_glyph = 'point') +
  scale_y_continuous(labels = numform::ff_num(zero = 0, digits = 2)) + 
  scale_color_grey(start = 0.7, end = 0.5) +
  coord_flip() +
  guides(colour = guide_legend(override.aes = list(size = 2.5)))+
  labs(x = 'Explanatory Variables (Brain Regions)', y = 'Averaged Coefficient Across Models (±2 SD)', col = 'Model type',
       title = paste0("DTI Predicting\n", plotting_names$longer_name[[which(plotting_names$response==.)]])) +
  #facet_wrap(~ direction, scales = 'free_y') +
  scale_color_manual(values = c("#56B4E9", "black"),labels = c("Permuted Null", "Target")) +     
  theme_bw() +  
#theme(axis.text.y = element_text(angle = 15)) +
theme(legend.title=element_blank()) +  
theme(legend.position = "top") + 
theme(
  axis.title.x = element_text(size = 15),
  axis.text.x = element_text(size = 12),
  axis.title.y = element_text(size = 15),
  axis.text.y = element_text(size = 12),
  legend.text = element_text(size = 15),
  plot.title = element_text(size=15)) +
 theme(
    strip.background = element_blank(),
    strip.text.x = element_blank()
) 
+ theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
)

  } 


enet_interval_plot_DTI(data_input=enet_tuning_fit,
                       tuning_input=enet_results_all,
                       resp_input = resp_names,
                       plotting_names=resp_var_plotting)

## $TFMRI_NB_ALL_BEH_C2B_RATE

## 
## $NIHTBX_PICVOCAB_UNCORRECTED

## 
## $NIHTBX_FLANKER_UNCORRECTED

## 
## $NIHTBX_PATTERN_UNCORRECTED

## 
## $NIHTBX_PICTURE_UNCORRECTED

## 
## $NIHTBX_READING_UNCORRECTED

## 
## $LMT_SCR_PERC_CORRECT

## 
## $PEA_RAVLT_LD_TRIAL_VII_TC

## 
## $TFMRI_SST_ALL_BEH_TOTAL_ISSRT

2.4 Combine the predicted values with Random Forest Oppornistic Stacking

2.4.1 Random Forest Data preparation for each cognitive task

Her we replace NA from each modality with unlikely small and large values, resulting in 2 predictors for each modality

## the following function just process the data to put into random forest model  
  rnd_for_data_prep = function(pred_select = "predicted",
                               data_list,
                               resp_chose,
                               resp_data=assessment(baseline_two_fold$splits[[1]]))
    {
    
      enet_pred <- plyr::join_all(list( 
        data_list[["MID_results"]][[resp_chose]][[pred_select]]%>% 
          dplyr::rename("MID_pred"="pred"),
        data_list[["smri_results"]][[resp_chose]][[pred_select]]%>% 
          dplyr::rename("smri_pred"="pred"),
        data_list[["SST_results"]][[resp_chose]][[pred_select]]%>% 
          dplyr::rename("SST_pred"="pred"),
        data_list[["rsmri_results"]][[resp_chose]][[pred_select]]%>% 
          dplyr::rename("rsmri_pred"="pred"),
        data_list[["Nback_results"]][[resp_chose]][[pred_select]]%>% 
          dplyr::rename("Nback_pred"="pred"),
        data_list[["DTI_results"]][[resp_chose]][[pred_select]]%>% 
          dplyr::rename("DTI_pred"="pred")), 
        by='SUBJECTKEY', type='full') %>% 
  distinct(SUBJECTKEY, .keep_all = TRUE)   
      
      enet_pred_p <- enet_pred%>% mutate_all(~replace(., is.na(.), 1000))
      enet_pred_n <- enet_pred%>% mutate_all(~replace(., is.na(.), -1000))
      
      enet_pred_noNA <- left_join(enet_pred_p,enet_pred_n,by="SUBJECTKEY")
      resp_data <- select(resp_data,"SUBJECTKEY",all_of(resp_chose)) %>% 
        distinct(SUBJECTKEY, .keep_all = TRUE) %>%   
        drop_na() %>% 
        mutate_if(is.double, ~ (.x - mean(.x)) / sd(.x))
      
      enet_pred_noNA <- left_join(enet_pred_noNA,resp_data,by="SUBJECTKEY")
      enet_pred_noNA_noID <- select(enet_pred_noNA,-SUBJECTKEY)
      
      return(list("ID"=enet_pred_noNA,"NoID"=enet_pred_noNA_noID))
  }

enet_rf_valid_data <- resp_names %>% 
  map(.,~rnd_for_data_prep(data_list = enet_results_all, 
                           resp_chose = all_of(.)))

enet_rf_test_baseline <- resp_names %>% 
  map(.,~rnd_for_data_prep(pred_select="predicted_baseline", 
                           data_list = enet_results_all, 
                           resp_chose = all_of(.),
                           resp_data = baseline_test))

enet_rf_test_followup <- resp_names %>% 
  map(.,~rnd_for_data_prep(pred_select="predicted_followup", 
                           data_list = enet_results_all, 
                           resp_chose = all_of(.),
                           resp_data = followup_analysis))


stack_datasize_extract <- function(resp_input, data_input){
  one_allset <- data.frame(datasplit_number =dim(data_input[[resp_input]][["ID"]])[1],
                       response=resp_input)
  return(one_allset)}

task_stack_valid <- resp_names %>% 
  map(.,~stack_datasize_extract(resp_input = .,data_input = enet_rf_valid_data))%>%
  do.call(rbind,.)%>%
  mutate(number_valid = datasplit_number)%>%
  subset(.,select=-c(datasplit_number))

task_stack_baseline <- resp_names %>% 
  map(.,~stack_datasize_extract(resp_input = .,data_input = enet_rf_test_baseline))%>%
  do.call(rbind,.)%>%
  mutate(baseline_test = datasplit_number)%>%
  subset(.,select=-c(datasplit_number))


task_stack_followup <- resp_names %>% 
  map(.,~stack_datasize_extract(resp_input = .,data_input = enet_rf_test_followup))%>%
  do.call(rbind,.)%>%
  mutate(followup_test = datasplit_number)%>%
  subset(.,select=-c(datasplit_number))

plyr::join_all(list(task_stack_valid,task_stack_baseline,task_stack_followup,resp_var_plotting), 
                 type="left", by = "response")%>%
  select("short_name","number_valid",ends_with("test"))%>%
  kableExtra::kbl(caption = paste0("Sample size of all the data sets in stacked models ")) %>%
   kableExtra::kable_classic(full_width = F, html_font = "Cambria")

Sample size of all the data sets in stacked models
short_name	number_valid	baseline_test	followup_test
2-back Work Mem	2480	4597	4902
Pic Vocab	2812	5240	5282
Flanker	2810	5235	5372
Pattern Speed	2804	5229	5351
Seq Memory	2809	5235	5368
Reading Recog	2809	5220	5269
Little Man	2808	5093	5458
Audi Verbal	2793	5182	5350
integration SSRT	1936	3725	4053

2.4.2 random forest tuning function

here we specified the 2nd layer model

random_forest_tuning <- function(resp,valid_list,baseline_test, followup_test){
rf_train_data <- valid_list[[resp]]
rf_test_baseline <- baseline_test[[resp]]
rf_test_followup <- followup_test[[resp]]
### to avoid repetition in coding, data frame lists are created with ID and without ID.

resp_var <- resp
##data recipe, assign "SUBJECTKEY" to ID (a class of variables neither belong to predictors nor response variable)
## do not scale the input data as 1000 and -1000 are put there on purpose
forest_rec <- recipe(as.formula(paste0(resp_var, "~ .")), 
                     data = rf_train_data[["NoID"]]) %>%
              step_dummy(all_nominal()) %>% 
              prep(training = rf_train_data[["NoID"]], retain = TRUE)
## mtry is the number of predictors to sample at each split
## min_n (the number of observations needed to keep splitting nodes)
tune_spec <-rand_forest(mtry = tune(),
                         trees = 1000,
                         min_n = tune()) %>%
            set_mode("regression") %>%
            set_engine("ranger")

tune_wf <- workflow() %>%
  add_recipe(forest_rec) %>%
  add_model(tune_spec)

set.seed(123456)

## nested data validation 
all_rs <- rsample::vfold_cv(data=rf_train_data[["ID"]],v = 10,repeats = 1)
## automate generate grid for hyperparameters
#doParallel::registerDoParallel()


rf_grid <- grid_regular(
  mtry(range = c(1, 12)), #12 is the highest number of predictors 
  min_n(range = c(1, 1000)),
  levels = c(mtry = 12, min_n = 101) #mtry = 10 so that it increases one step at a time 
)

tune_res <- tune_grid(
  tune_wf,
  resamples = all_rs,
  metrics = metric_set(rmse),
  grid = rf_grid
)

best_tune <- select_best(tune_res, metric = "rmse")

final_rf <- finalize_model(
  tune_spec,
  best_tune
)

forest_prep <- prep(forest_rec)

## this one fit the random forest model with the best tuned parameters

rf_fit <- final_rf %>%
  set_engine("ranger",importance="permutation") %>% 
  parsnip::fit(as.formula(paste0(resp_var, "~ .")),
  data = juice(forest_prep) 
  )

pred_baseline <- predict(rf_fit,new_data = rf_test_baseline[["NoID"]])
pred_followup <- predict(rf_fit,new_data = rf_test_followup[["NoID"]])
performance_baseline <- cor(pred_baseline$.pred,rf_test_baseline[["ID"]][[resp]])
performance_followup <- cor(pred_followup$.pred,rf_test_followup[["ID"]][[resp]])
best_tune <- best_tune %>% mutate(performance_baseline=performance_baseline,
                                  performance_followup=performance_followup)
result_list <- list("param_tune"=best_tune, 
                    "predicted_baseline"=tibble(SUBJECTKEY=rf_test_baseline[["ID"]]$SUBJECTKEY, 
                                                "pred_baseline"=pred_baseline$.pred),
                    "predicted_followup"=tibble(SUBJECTKEY=rf_test_followup[["ID"]]$SUBJECTKEY,
                                                "pred_followup"=pred_followup$.pred),
                    "model_fit"=rf_fit)

return(result_list)
}

2.4.3 random forest model fit for each cognitive task

Here we actually put all of the fitted results into the random forest. It is time-consuming and computing-expensive to run, so not run by default.

##keep from using all the cores
##keep your computer safe
no_cores <- availableCores() - 4
plan(multisession,workers= no_cores)

rf_fit_all <- resp_names %>% 
  future_map(.,~random_forest_tuning(resp=.,
                                     valid_list = enet_rf_valid_data,
                                     baseline_test = enet_rf_test_baseline,
                                     followup_test = enet_rf_test_followup),
             .options = furrr_options(seed=TRUE),.progress = TRUE)

2.4.4 print the tuned parameters and model performace of the random forest model for each cognitve task

random_fores_output_print <- 
  function(rf_output,enet_output,response_vec,names_plotting){
    
  summary_rf <- vector("list", length = length(response_vec))
  names(summary_rf) <- response_vec

  for(i in 1:length(response_vec)){
  summary_rf[[response_vec[i]]]<- as_tibble(rf_output[[response_vec[i]]][["param_tune"]]) %>% slice(1)
  }
  
  ## mtry is the number of predictors to sample at each split
  ## min_n (the number of observations needed to keep splitting nodes)
  
  rf_perform_output <- summary_rf %>% bind_rows() %>% 
    mutate(respones = names_plotting$short_name)%>%
    rename(., "#predictors/split"=mtry,
           "#observations/split"=min_n,
           "model#"=.config,
           "Baseline (Pearson r)"=performance_baseline,
           "Followup  (Pearson r)"=performance_followup) 
  
  rf_result_vec <- paste0(mri_vec,"_results")
  
  all_enet_performance <- response_vec %>% 
    map(.,function(resp){
      summary_mri_all <- vector("list", length = length(rf_result_vec))
      names(summary_mri_all)<-rf_result_vec 
  
      for(i in 1:length(rf_result_vec)){
        summary_mri_all[[rf_result_vec[i]]]<- 
          as_tibble(enet_output[[rf_result_vec[i]]][[resp]][["param_tune"]]) %>% slice(1)
      }
  
      summary_mri_all <- summary_mri_all %>% 
        bind_rows() %>% 
        mutate(type = mri_vec)
  
      return(summary_mri_all)
  })

summary_rf_all <- summary_rf %>% bind_rows() %>% mutate(response=response_vec)

all_enet_performance_baseline <- all_enet_performance %>% 
  do.call(rbind,.) %>%
  mutate(.,response = rep(response_vec, each=length(mri_vec))) %>%
  select(type,baseline_performance,response) %>% 
  pivot_wider(names_from = type, values_from= c(baseline_performance))


all_enet_baseline <-  
  plyr::join_all(list(all_enet_performance_baseline,names_plotting,summary_rf_all), 
                 type="left", by = "response") %>% 
  select(all_of(mri_vec),"short_name","performance_baseline")%>%
  rename(random_forest_baseline = performance_baseline)

all_enet_performance_followup <- all_enet_performance%>% do.call(rbind,.) %>%
  mutate(.,response = rep(response_vec, each=length(mri_vec))) %>%
  select(type,followup_performance,response) %>% 
  pivot_wider(names_from = type, values_from= c(followup_performance))

all_enet_followup <-  
  plyr::join_all(list(all_enet_performance_followup,names_plotting,summary_rf_all), 
                 type="left", by = "response") %>% 
  select(all_of(mri_vec),"short_name","performance_followup")%>%
rename(random_forest_followup = performance_followup)

return(list(rf_performance = rf_perform_output,rf_enet_baseline = all_enet_baseline,rf_enet_followup = all_enet_followup))
}

task_rf_results <- random_fores_output_print(rf_output = rf_fit_all,
                                             enet_output = enet_results_all,
                                             response_vec=resp_names,
                                             names_plotting = resp_var_plotting)

task_rf_results[["rf_performance"]]%>% 
  rename("Response Variable" = respones, "Baseline" = "Baseline (Pearson r)", "Followup" = "Followup  (Pearson r)") %>% 
  select("Response Variable", "Baseline","Followup", "#predictors/split", "#observations/split") %>%
  mutate(across(2:3, round, 3)) %>% 
  kableExtra::kbl(caption = paste0("Performance (Pearson r) and Hyperparameters of Random Forest Stacking for Each Cognitive Task")) %>%
  kableExtra::kable_classic(full_width = F, html_font = "Cambria")

Performance (Pearson r) and Hyperparameters of Random Forest Stacking for Each Cognitive Task
Response Variable	Baseline	Followup	#predictors/split	#observations/split
2-back Work Mem	0.478	0.417	3	140
Pic Vocab	0.364	0.379	4	150
Flanker	0.183	0.199	4	490
Pattern Speed	0.193	0.189	2	420
Seq Memory	0.237	0.247	4	410
Reading Recog	0.311	0.311	12	280
Little Man	0.269	0.264	3	270
Audi Verbal	0.239	0.182	3	590
integration SSRT	0.292	0.333	6	350

task_rf_results[["rf_enet_baseline"]] %>% 
  rename("Response Variable" = short_name,
         "NBack-fMRI" = Nback,
         "SST-fMRI" = SST,
         "MID-fMRI" = MID,
         "rs-fMRI" = rsmri,
         "sMRI" = smri,
         "Stacked" = random_forest_baseline) %>% 
  select("Response Variable", 
         "Stacked",
         "NBack-fMRI",
         "MID-fMRI",
         "SST-fMRI",
         "rs-fMRI",
         "sMRI",
         "DTI") %>%
  mutate(across(2:8, round, 3)) %>% 
  kableExtra::kbl(caption = paste0("Performance of Random Forest Stacking VS Elastic Net on Each Modality for Each Cognitive Task\n at the Baseline (Pearson r)")) %>%
  kableExtra::kable_classic(full_width = F, html_font = "Cambria")

Performance of Random Forest Stacking VS Elastic Net on Each Modality for Each Cognitive Task at the Baseline (Pearson r)
Response Variable	Stacked	NBack-fMRI	MID-fMRI	SST-fMRI	rs-fMRI	sMRI	DTI
2-back Work Mem	0.478	0.492	0.154	0.175	0.125	0.140	0.130
Pic Vocab	0.364	0.318	0.174	0.124	0.213	0.184	0.178
Flanker	0.183	0.199	0.065	0.018	0.057	0.033	0.079
Pattern Speed	0.193	0.182	0.080	0.032	0.087	0.090	0.121
Seq Memory	0.237	0.210	0.102	NA	0.095	0.112	0.120
Reading Recog	0.311	0.289	0.157	0.102	0.189	0.166	0.138
Little Man	0.269	0.243	0.079	0.095	0.114	0.131	0.116
Audi Verbal	0.239	0.151	0.093	0.076	0.124	0.094	0.098
integration SSRT	0.292	0.122	0.015	0.328	0.031	0.067	0.086

task_rf_results[["rf_enet_followup"]] %>%
    rename("Response Variable" = short_name,
         "NBack-fMRI" = Nback,
         "SST-fMRI" = SST,
         "MID-fMRI" = MID,
         "rs-fMRI" = rsmri,
         "sMRI" = smri,
         "Stacked" = random_forest_followup) %>% 
  select("Response Variable", 
         "Stacked",
         "NBack-fMRI",
         "MID-fMRI",
         "SST-fMRI",
         "rs-fMRI",
         "sMRI",
         "DTI") %>%
  mutate(across(2:8, round, 3)) %>% 
  kableExtra::kbl(caption = paste0("Performance of Random Forest Stacking VS Elastic Net on Each Modality for Each Cognitive Task\n at the Follow-Up (Pearson r)")) %>%
  kableExtra::kable_classic(full_width = F, html_font = "Cambria")

Performance of Random Forest Stacking VS Elastic Net on Each Modality for Each Cognitive Task at the Follow-Up (Pearson r)
Response Variable	Stacked	NBack-fMRI	MID-fMRI	SST-fMRI	rs-fMRI	sMRI	DTI
2-back Work Mem	0.417	0.480	0.132	0.195	0.185	0.177	0.199
Pic Vocab	0.379	0.298	0.128	0.147	0.251	0.226	0.190
Flanker	0.199	0.174	0.044	0.050	0.093	0.058	0.141
Pattern Speed	0.189	0.193	0.063	0.048	0.090	0.102	0.093
Seq Memory	0.247	0.228	0.098	NA	0.127	0.128	0.169
Reading Recog	0.311	0.284	0.126	0.092	0.205	0.174	0.151
Little Man	0.264	0.237	0.058	0.082	0.097	0.118	0.246
Audi Verbal	0.182	0.156	0.046	0.080	0.126	0.091	0.062
integration SSRT	0.333	0.160	0.017	0.371	0.086	0.077	0.120

2.4.5 Plot non-conditional permutated variable importance of the random forest model

the results should be treated with caution as the opportunistic stacking includes inherently colineared predictors.

library(vip)

## 
## Attaching package: 'vip'

## The following object is masked from 'package:utils':
## 
##     vi

resp_names%>% map(.,~rf_fit_all[[.]][["model_fit"]] %>%
  vip(geom = "point"))

## $TFMRI_NB_ALL_BEH_C2B_RATE

## 
## $NIHTBX_PICVOCAB_UNCORRECTED

## 
## $NIHTBX_FLANKER_UNCORRECTED

## 
## $NIHTBX_PATTERN_UNCORRECTED

## 
## $NIHTBX_PICTURE_UNCORRECTED

## 
## $NIHTBX_READING_UNCORRECTED

## 
## $LMT_SCR_PERC_CORRECT

## 
## $PEA_RAVLT_LD_TRIAL_VII_TC

## 
## $TFMRI_SST_ALL_BEH_TOTAL_ISSRT

2.4.6 Plot conditional permutated variable importance of the random forest stacking model

enet_rf_valid_data_permu <- enet_rf_valid_data %>% map(.,function(data_input=.){
 data_output<- data_input[["NoID"]] %>%
   dplyr::rename("MID_large"="MID_pred.x",
                 "SST_large"="SST_pred.x",
                 "smri_large"="smri_pred.x",
                 "rsmri_large"="rsmri_pred.x",
                 "DTI_large"="DTI_pred.x",
                 "Nback_large"="Nback_pred.x",
                 "MID_small"="MID_pred.y",
                 "SST_small"="SST_pred.y",
                 "smri_small"="smri_pred.y",
                 "rsmri_small"="rsmri_pred.y",
                 "DTI_small"="DTI_pred.y",
                 "Nback_small"="Nback_pred.y")
 return(data_output)
  })


cf_fit <- resp_names %>% map(.,function(resp_var=.){
  cforest(as.formula(paste0(resp_var, "~ .")),
          data=enet_rf_valid_data_permu[[resp_var]],
          control= cforest_unbiased(mtry=rf_fit_all[[resp_var]][["param_tune"]]$mtry,
                                    ntree=1000,minsplit=rf_fit_all[[resp_var]][["param_tune"]]$min_n))
}) 

PI_permimp <-cf_fit %>% map(.,~permimp(.,progressBar = FALSE)) 

resp_names %>% map(.,~plot(PI_permimp[[.]], 
                           type = "box", 
                           horizontal = TRUE,
                           main=paste0("Conditional Permutation Importance ",
                                       resp_var_plotting$short_name[[which(resp_var_plotting$response==.)]])))

## $TFMRI_NB_ALL_BEH_C2B_RATE
## NULL
## 
## $NIHTBX_PICVOCAB_UNCORRECTED
## NULL
## 
## $NIHTBX_FLANKER_UNCORRECTED
## NULL
## 
## $NIHTBX_PATTERN_UNCORRECTED
## NULL
## 
## $NIHTBX_PICTURE_UNCORRECTED
## NULL
## 
## $NIHTBX_READING_UNCORRECTED
## NULL
## 
## $LMT_SCR_PERC_CORRECT
## NULL
## 
## $PEA_RAVLT_LD_TRIAL_VII_TC
## NULL
## 
## $TFMRI_SST_ALL_BEH_TOTAL_ISSRT
## NULL

resp_names %>% map(.,~plot(PI_permimp[[.]], 
                           type = "bar", 
                           interval="quantile",
                           main=paste0("Conditional Permutation Importance ",
                                       resp_var_plotting$short_name[[which(resp_var_plotting$response==.)]])))

## $TFMRI_NB_ALL_BEH_C2B_RATE
## NULL
## 
## $NIHTBX_PICVOCAB_UNCORRECTED
## NULL
## 
## $NIHTBX_FLANKER_UNCORRECTED
## NULL
## 
## $NIHTBX_PATTERN_UNCORRECTED
## NULL
## 
## $NIHTBX_PICTURE_UNCORRECTED
## NULL
## 
## $NIHTBX_READING_UNCORRECTED
## NULL
## 
## $LMT_SCR_PERC_CORRECT
## NULL
## 
## $PEA_RAVLT_LD_TRIAL_VII_TC
## NULL
## 
## $TFMRI_SST_ALL_BEH_TOTAL_ISSRT
## NULL

2.4.6.1 Plot obverved against predicted value for the Random Forest Stacking model for each task

random_forest_plotting_followup <- function(rf_fit, followup_data, resp_vec, naming_frame ){
  followup_rf_pred <- resp_vec %>% map(.,function(resp_var=.){
  resp_pred <- tibble(pred = rf_fit[[resp_var]][["predicted_followup"]][["pred_followup"]],
                      observed=followup_data[[resp_var]][["ID"]][[resp_var]],
                      SUBJECTKEY=followup_data[[resp_var]][["ID"]][["SUBJECTKEY"]])
  return(resp_pred)
})

task_followup_pred <- resp_vec %>% 
  map(., ~ggplot(followup_rf_pred[[.]],
                 aes(x = scale(pred) , 
                     y = scale(observed))) +
  geom_jitter(height = 0.1, width = 0.1, size = 1, col = 'grey60') +
  geom_smooth(method = 'lm', se = FALSE, col = 'black')  +
  labs(x = NULL,
       y = NULL,
       title = paste (naming_frame$short_name[[which(naming_frame$response==.)]],
                      '\nr = ',round(rf_fit[[.]][["param_tune"]][["performance_followup"]], 3))) 
                     )
  title_task_rf_pred_plot <- ggdraw() + 
  draw_label(
    "Out-of-Sample Random Forest Stacking Predictive Ability at Followup",
    fontface = 'bold',
    x = 0,
    hjust = 0
  ) +
  theme(
    # add margin on the left of the drawing canvas,
    # so title is aligned with left edge of first plot
    plot.margin = margin(0, 0, 0, 7)
  )
  followup_pred_all_figure<- plot_grid(title_task_rf_pred_plot,plot_grid(plotlist = task_followup_pred),nrow = 2 , rel_heights = c(0.1, 1))

  ggpubr::annotate_figure(followup_pred_all_figure,left= ggpubr::text_grob("Observed Cognitive Performance (Z)",size=15,rot=90),
                        bottom = ggpubr::text_grob("Predicted Cognitive Performance (Z)",size=15))
}

random_forest_plotting_baseline <- function(rf_fit, baseline_data, resp_vec,naming_frame ){
 baseline_rf_pred <- resp_vec %>% map(.,function(resp_var=.){
  resp_pred <- tibble(pred = rf_fit[[resp_var]][["predicted_baseline"]][["pred_baseline"]],
                                 observed=baseline_data[[resp_var]][["ID"]][[resp_var]],
                                 SUBJECTKEY=baseline_data[[resp_var]][["ID"]][["SUBJECTKEY"]])
  return(resp_pred)
})

task_baseline_pred <- resp_vec %>% map(., ~ggplot(baseline_rf_pred[[.]],aes(x = scale(pred) , y = scale(observed))) +
  geom_jitter(height = 0.1, width = 0.1, size = 1, col = 'grey60') +
  geom_smooth(method = 'lm', se = FALSE, col = 'black')  +
  labs(x = NULL,
       y = NULL,
       title = paste (naming_frame$short_name[[which(naming_frame$response==.)]],'\nr = ',round(rf_fit[[.]][["param_tune"]][["performance_baseline"]], 3))) 
                     )
  title_task_rf_pred_plot_baseline <- ggdraw() + 
  draw_label(
    "Out-of-Sample Random Forest Stacking Predictive Ability at Baseline",
    fontface = 'bold',
    x = 0,
    hjust = 0
  ) +
  theme(
    # add margin on the left of the drawing canvas,
    # so title is aligned with left edge of first plot
    plot.margin = margin(0, 0, 0, 7)
  )
  baseline_pred_all_figure<- plot_grid(title_task_rf_pred_plot_baseline,
                                       plot_grid(plotlist = task_baseline_pred),
                                       nrow = 2 , 
                                       rel_heights = c(0.1, 1))

  ggpubr::annotate_figure(baseline_pred_all_figure,
                          left= ggpubr::text_grob("Observed Cognitive Performance (Z)",size=15,rot=90),
                        bottom = ggpubr::text_grob("Predicted Cognitive Performance (Z)",size=15)) 
}


random_forest_plotting_followup(rf_fit =rf_fit_all,followup_data=enet_rf_test_followup,naming_frame = resp_var_plotting, resp_vec= resp_names)

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

random_forest_plotting_baseline(rf_fit =rf_fit_all,baseline_data=enet_rf_test_baseline,naming_frame = resp_var_plotting , resp_vec= resp_names)

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

3 Relationship between MRI and G-factor: Confirmatory factor analysis

here cfa is used to compute g factor.
We fit two models (first order and second order models) onto the training data, and apply the fit to the validation (2nd-layer) and test sets (baseline and followup). This is to protect us from dataleakage.

3.1 set up CFA

# we only inlcuded task that are avaliable on two time points

TaskDVs1Batch = c("NIHTBX_PICVOCAB_UNCORRECTED", 
                  "NIHTBX_READING_UNCORRECTED",
              "NIHTBX_FLANKER_UNCORRECTED",
              "NIHTBX_PATTERN_UNCORRECTED",
              "NIHTBX_PICTURE_UNCORRECTED",
               "PEA_RAVLT_LD_TRIAL_VII_TC"
)


CFA_lav_output_processing <- function(data_input,CFA_model1,CFA_model2,CFA_model3){
first_order_output <- lavaan::lavPredict(CFA_model1, newdata =data_input )
second_order_output <- lavaan::lavPredict(CFA_model2, newdata = data_input)
single_factor_output <- lavaan::lavPredict(CFA_model3, newdata = data_input)

### getting SUBJECTKEY for CFA model prediction
### the number of columns of CFA predictions and task performance data matched
task_data <- select(data_input,all_of(TaskDVs1Batch),"SUBJECTKEY")%>% drop_na()

CFA_lav <- tibble(SUBJECTKEY =task_data$SUBJECTKEY, 
                  Lang_first_order = first_order_output[,1], 
                  CogFlex_first_order = first_order_output[,2], 
                  MemRe_first_order = first_order_output[,3],
                  g_second_order =second_order_output[,4],
                  g_single_factor =single_factor_output[,1])
return(CFA_lav)
}

CFA_lav_response <- function(data_input = baseline_two_fold$splits[[1]], 
                             test_baseline =baseline_test, 
                             test_followup=followup_analysis ){
  ## training the model and for enet parameter tuning
data_train <- analysis(data_input)%>%
              select(all_of(subj_info),all_of(TaskDVs1Batch))%>% drop_na()%>% IQR_remove()
## random forest tuning
data_valid <- assessment(data_input)%>%
              select(all_of(subj_info),all_of(TaskDVs1Batch))%>% drop_na()%>% IQR_remove()
##  test set for baseline observations
data_baseline <- test_baseline%>%
              select(all_of(subj_info),all_of(TaskDVs1Batch))%>% drop_na()%>% IQR_remove()
##  test set for followup observations
data_followup <- test_followup%>%
              select(all_of(subj_info),all_of(TaskDVs1Batch))%>% drop_na()%>% IQR_remove()
### first order CFA model

NeuroCog1stOrder <-'
Language =~ NIHTBX_PICVOCAB_UNCORRECTED + NIHTBX_READING_UNCORRECTED 
CognitiveFlexibity =~ NIHTBX_FLANKER_UNCORRECTED + NIHTBX_PATTERN_UNCORRECTED 
MemoryRecall =~ NIHTBX_PICTURE_UNCORRECTED + PEA_RAVLT_LD_TRIAL_VII_TC'

NeuroCog1stOrder.Fit <- lavaan::cfa(model = NeuroCog1stOrder, data = data_train,estimator="MLR")

### second order CFA model

NeuroCog2ndOrder <-'
Language =~ NIHTBX_PICVOCAB_UNCORRECTED + NIHTBX_READING_UNCORRECTED 
CognitiveFlexibity =~ NIHTBX_FLANKER_UNCORRECTED + NIHTBX_PATTERN_UNCORRECTED 
MemoryRecall =~ NIHTBX_PICTURE_UNCORRECTED + PEA_RAVLT_LD_TRIAL_VII_TC
g =~ NA*Language + CognitiveFlexibity  + MemoryRecall #estimate the loading of GenAbi -> as opposed to using it as a marker
g ~~ 1*g #need to constrain variance to 1'

NeuroCog2ndOrder.Fit <- lavaan::cfa(model = NeuroCog2ndOrder, data = data_train,estimator="MLR")

### single factor CFA model

NeuroCogSingleFactor <-'
g =~ NIHTBX_PICVOCAB_UNCORRECTED + NIHTBX_READING_UNCORRECTED + NIHTBX_FLANKER_UNCORRECTED + NIHTBX_PATTERN_UNCORRECTED + NIHTBX_PICTURE_UNCORRECTED + PEA_RAVLT_LD_TRIAL_VII_TC'

NeuroCogSingleFactor.Fit <- lavaan::cfa(model = NeuroCogSingleFactor, data = data_train,estimator="MLR")

train_CFA_lav <- CFA_lav_output_processing(data_input = data_train,
                                           CFA_model1 = NeuroCog1stOrder.Fit,
                                           CFA_model2 = NeuroCog2ndOrder.Fit,
                                           CFA_model3 = NeuroCogSingleFactor.Fit)
valid_CFA_lav <- CFA_lav_output_processing(data_input = data_valid,
                                           CFA_model1 = NeuroCog1stOrder.Fit,
                                           CFA_model2 = NeuroCog2ndOrder.Fit,
                                           CFA_model3 = NeuroCogSingleFactor.Fit)
baseline_CFA_lav <- CFA_lav_output_processing(data_input = data_baseline,
                                              CFA_model1 = NeuroCog1stOrder.Fit,
                                              CFA_model2 = NeuroCog2ndOrder.Fit,
                                              CFA_model3 = NeuroCogSingleFactor.Fit)
followup_CFA_lav <- CFA_lav_output_processing(data_input = data_followup,
                                              CFA_model1 = NeuroCog1stOrder.Fit,
                                              CFA_model2 = NeuroCog2ndOrder.Fit,
                                              CFA_model3 = NeuroCogSingleFactor.Fit)

output_list <- list(enet_tuning = train_CFA_lav, 
                    RanFor_tuning = valid_CFA_lav, 
                    testing_baseline = baseline_CFA_lav,
                    testing_followup=followup_CFA_lav)

return(list(output_list, NeuroCog1stOrder.Fit, NeuroCog2ndOrder.Fit, NeuroCogSingleFactor.Fit))
}

CFA_list <- CFA_lav_response(data_input = baseline_two_fold$splits[[1]], 
                             test_baseline =baseline_test, 
                             test_followup=followup_analysis)
CFA_response <- CFA_list[[1]]
NeuroCog1stOrder.Fit <- CFA_list[[2]]
NeuroCog2ndOrder.Fit <- CFA_list[[3]]
NeuroCogSingleFactor.Fit <- CFA_list[[4]]

3.1.1 find the number of EFA factors using parallel analysis

the result is 3

data_train_fa <- analysis(baseline_two_fold$splits[[1]])%>%
              select(all_of(subj_info),all_of(TaskDVs1Batch))%>% drop_na()%>% IQR_remove()

data_train_fa %>%  select(all_of(TaskDVs1Batch)) %>%
  psych::fa.parallel(fm='ml', fa='fa')

## Parallel analysis suggests that the number of factors =  3  and the number of components =  NA

3.1.2 fitting Oblimin FA

Task.faObli <- 
  data_train_fa %>%  
  select(all_of(TaskDVs1Batch)) %>%
  psych::fa(nfactors = 3, #number of factors
            rotate = "oblimin", #rotation type
            fm = "ml", #math
            score = T)

## Loading required namespace: GPArotation

print(Task.faObli$loadings, 
      #cut =.3, 
      sort=TRUE)

## 
## Loadings:
##                             ML1    ML3    ML2   
## NIHTBX_PICVOCAB_UNCORRECTED  0.702              
## NIHTBX_READING_UNCORRECTED   0.730              
## NIHTBX_PICTURE_UNCORRECTED          0.688       
## PEA_RAVLT_LD_TRIAL_VII_TC    0.129  0.575       
## NIHTBX_FLANKER_UNCORRECTED   0.147         0.534
## NIHTBX_PATTERN_UNCORRECTED                 0.679
## 
##                  ML1   ML3   ML2
## SS loadings    1.070 0.805 0.750
## Proportion Var 0.178 0.134 0.125
## Cumulative Var 0.178 0.313 0.437

Task.faObli

## Factor Analysis using method =  ml
## Call: psych::fa(r = ., nfactors = 3, rotate = "oblimin", scores = T, 
##     fm = "ml")
## Standardized loadings (pattern matrix) based upon correlation matrix
##                               ML1   ML3   ML2   h2   u2 com
## NIHTBX_PICVOCAB_UNCORRECTED  0.70  0.01  0.01 0.51 0.49 1.0
## NIHTBX_READING_UNCORRECTED   0.73  0.01  0.01 0.55 0.45 1.0
## NIHTBX_FLANKER_UNCORRECTED   0.15 -0.01  0.53 0.37 0.63 1.2
## NIHTBX_PATTERN_UNCORRECTED  -0.05  0.02  0.68 0.44 0.56 1.0
## NIHTBX_PICTURE_UNCORRECTED  -0.06  0.69  0.04 0.45 0.55 1.0
## PEA_RAVLT_LD_TRIAL_VII_TC    0.13  0.58 -0.04 0.41 0.59 1.1
## 
##                        ML1  ML3  ML2
## SS loadings           1.12 0.84 0.78
## Proportion Var        0.19 0.14 0.13
## Cumulative Var        0.19 0.33 0.46
## Proportion Explained  0.41 0.31 0.28
## Cumulative Proportion 0.41 0.72 1.00
## 
##  With factor correlations of 
##      ML1  ML3  ML2
## ML1 1.00 0.58 0.43
## ML3 0.58 1.00 0.42
## ML2 0.43 0.42 1.00
## 
## Mean item complexity =  1
## Test of the hypothesis that 3 factors are sufficient.
## 
## The degrees of freedom for the null model are  15  and the objective function was  1.02 with Chi Square of  2945.66
## The degrees of freedom for the model are 0  and the objective function was  0 
## 
## The root mean square of the residuals (RMSR) is  0 
## The df corrected root mean square of the residuals is  NA 
## 
## The harmonic number of observations is  2902 with the empirical chi square  0.01  with prob <  NA 
## The total number of observations was  2902  with Likelihood Chi Square =  0.01  with prob <  NA 
## 
## Tucker Lewis Index of factoring reliability =  -Inf
## Fit based upon off diagonal values = 1
## Measures of factor score adequacy             
##                                                    ML1  ML3  ML2
## Correlation of (regression) scores with factors   0.85 0.80 0.78
## Multiple R square of scores with factors          0.73 0.65 0.60
## Minimum correlation of possible factor scores     0.45 0.30 0.21

3.1.3 prep data for the EFA -> CFA model

## training the model and for enet parameter tuning
data_train <- analysis(baseline_two_fold$splits[[1]])%>%
              select(all_of(subj_info),all_of(TaskDVs1Batch))%>% drop_na()%>% IQR_remove()
## random forest tuning
data_valid <- assessment(baseline_two_fold$splits[[1]])%>%
              select(all_of(subj_info),all_of(TaskDVs1Batch))%>% drop_na()%>% IQR_remove()
##  test set for baseline observations
data_baseline <- baseline_test%>%
              select(all_of(subj_info),all_of(TaskDVs1Batch))%>% drop_na()%>% IQR_remove()
##  test set for followup observations
data_followup <- followup_analysis%>%
              select(all_of(subj_info),all_of(TaskDVs1Batch))%>% drop_na()%>% IQR_remove()

3.1.4 build EFA -> CFA model using data_train

Task.faObli_data_train_pred <- psych::predict.psych(object = Task.faObli,
        data = data_train %>% select(all_of(TaskDVs1Batch)),
        old.data = data_train %>% select(all_of(TaskDVs1Batch))) %>%
        as_tibble() %>%
        rename(Language_EFA = ML1, 
               CognitiveFlexibity_EFA = ML2, 
               MemoryRecall_EFA = ML3)

NeuroCogEFA_CFA <-'
g =~ Language_EFA + MemoryRecall_EFA + CognitiveFlexibity_EFA'

NeuroCogEFA_CFA.Fit <- lavaan::cfa(model = NeuroCogEFA_CFA, 
                                   data = Task.faObli_data_train_pred,
                                   estimator="MLR")

CFA_response$enet_tuning <- 
  CFA_response$enet_tuning %>% bind_cols(
    lavaan::lavPredict(NeuroCogEFA_CFA.Fit, 
                       newdata =Task.faObli_data_train_pred)%>%
      as.double() %>% as_tibble() %>%
    rename(g_EFA_CFA = value))

3.1.5 apply the EFA -> CFA model to data valid, data baseline, data followup

## data valid
Task.faObli_data_valid_pred <- psych::predict.psych(object = Task.faObli,
        data = data_valid %>% select(all_of(TaskDVs1Batch)),
        old.data = data_train %>% select(all_of(TaskDVs1Batch))) %>%
        as_tibble() %>%
        rename(Language_EFA = ML1, 
               CognitiveFlexibity_EFA = ML2, 
               MemoryRecall_EFA = ML3)

CFA_response$RanFor_tuning <- 
  CFA_response$RanFor_tuning %>% bind_cols(
    lavaan::lavPredict(NeuroCogEFA_CFA.Fit, 
                       newdata =Task.faObli_data_valid_pred)%>%
    as.double() %>% as_tibble() %>%
    rename(g_EFA_CFA = value))

## data_baseline
Task.faObli_data_baseline_pred <- psych::predict.psych(object = Task.faObli,
        data = data_baseline %>% select(all_of(TaskDVs1Batch)),
        old.data = data_train %>% select(all_of(TaskDVs1Batch))) %>%
        as_tibble() %>%
        rename(Language_EFA = ML1, 
               CognitiveFlexibity_EFA = ML2, 
               MemoryRecall_EFA = ML3)

CFA_response$testing_baseline <- 
  CFA_response$testing_baseline %>% bind_cols(
    lavaan::lavPredict(NeuroCogEFA_CFA.Fit, 
                       newdata =Task.faObli_data_baseline_pred)%>%
    as.double() %>% as_tibble() %>%
    rename(g_EFA_CFA = value))

## data_follow up
Task.faObli_data_followup_pred <- psych::predict.psych(object = Task.faObli,
        data = data_followup %>% select(all_of(TaskDVs1Batch)),
        old.data = data_train %>% select(all_of(TaskDVs1Batch))) %>%
        as_tibble() %>%
        rename(Language_EFA = ML1, 
               CognitiveFlexibity_EFA = ML2, 
               MemoryRecall_EFA = ML3)

CFA_response$testing_followup <- 
  CFA_response$testing_followup %>% bind_cols(
    lavaan::lavPredict(NeuroCogEFA_CFA.Fit, 
                       newdata =Task.faObli_data_followup_pred)%>%
    as.double() %>% as_tibble() %>%
    rename(g_EFA_CFA = value))

3.1.6 look at the final EFA CFA model

lavaan::summary(NeuroCogEFA_CFA.Fit, standardized = TRUE, rsquare = TRUE, fit.measures = TRUE)

## lavaan 0.6-11 ended normally after 19 iterations
## 
##   Estimator                                         ML
##   Optimization method                           NLMINB
##   Number of model parameters                         6
##                                                       
##   Number of observations                          2902
##                                                       
## Model Test User Model:
##                                               Standard      Robust
##   Test Statistic                                 0.000       0.000
##   Degrees of freedom                                 0           0
## 
## Model Test Baseline Model:
## 
##   Test statistic                              3882.254    3837.255
##   Degrees of freedom                                 3           3
##   P-value                                        0.000       0.000
##   Scaling correction factor                                  1.012
## 
## User Model versus Baseline Model:
## 
##   Comparative Fit Index (CFI)                    1.000       1.000
##   Tucker-Lewis Index (TLI)                       1.000       1.000
##                                                                   
##   Robust Comparative Fit Index (CFI)                         1.000
##   Robust Tucker-Lewis Index (TLI)                            1.000
## 
## Loglikelihood and Information Criteria:
## 
##   Loglikelihood user model (H0)              -8584.296   -8584.296
##   Loglikelihood unrestricted model (H1)      -8584.296   -8584.296
##                                                                   
##   Akaike (AIC)                               17180.591   17180.591
##   Bayesian (BIC)                             17216.430   17216.430
##   Sample-size adjusted Bayesian (BIC)        17197.366   17197.366
## 
## Root Mean Square Error of Approximation:
## 
##   RMSEA                                          0.000       0.000
##   90 Percent confidence interval - lower         0.000       0.000
##   90 Percent confidence interval - upper         0.000       0.000
##   P-value RMSEA <= 0.05                             NA          NA
##                                                                   
##   Robust RMSEA                                               0.000
##   90 Percent confidence interval - lower                     0.000
##   90 Percent confidence interval - upper                     0.000
## 
## Standardized Root Mean Square Residual:
## 
##   SRMR                                           0.000       0.000
## 
## Parameter Estimates:
## 
##   Standard errors                             Sandwich
##   Information bread                           Observed
##   Observed information based on                Hessian
## 
## Latent Variables:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##   g =~                                                                  
##     Language_EFA      1.000                               0.741    0.870
##     MemoryRcll_EFA    0.933    0.021   44.922    0.000    0.692    0.860
##     CgntvFlxbt_EFA    0.720    0.019   37.065    0.000    0.534    0.687
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##    .Language_EFA      0.177    0.011   16.639    0.000    0.177    0.244
##    .MemoryRcll_EFA    0.169    0.009   17.908    0.000    0.169    0.261
##    .CgntvFlxbt_EFA    0.318    0.010   32.873    0.000    0.318    0.528
##     g                 0.550    0.022   25.501    0.000    1.000    1.000
## 
## R-Square:
##                    Estimate
##     Language_EFA      0.756
##     MemoryRcll_EFA    0.739
##     CgntvFlxbt_EFA    0.472

Plabels = c("Language","Memory\nRecall","Mental\nFlexibity",expression(paste(italic("g"))))
semPlot::semPaths(object=NeuroCogEFA_CFA.Fit,
                  intercepts = F, residuals = F,
                  whatLabels="no",
                  what = "std",
                  layout="tree",
                  node.width = 1.4,
                  edge.label.cex = .6,
                  nodeLabels=Plabels,
                  edge.color="black",
                  sizeMan = 10, sizeLat = 10)

#semPlot::semPaths(NeuroCogEFA_CFA.Fit)

3.1.7 plot the EFA part

colnames(Task.faObli$loadings) <- c("Language","Memory\nRecall","Mental\nFlexibity")
rownames(Task.faObli$loadings) <- 
  c("Vocab","Reading","Flanker","Pattern","Picture","RAVLT")
psych::fa.diagram(Task.faObli)

Task.faObli

## Factor Analysis using method =  ml
## Call: psych::fa(r = ., nfactors = 3, rotate = "oblimin", scores = T, 
##     fm = "ml")
## Standardized loadings (pattern matrix) based upon correlation matrix
##         Language Memory\nRecall Mental\nFlexibity   h2   u2 com
## Vocab       0.70           0.01              0.01 0.51 0.49 1.0
## Reading     0.73           0.01              0.01 0.55 0.45 1.0
## Flanker     0.15          -0.01              0.53 0.37 0.63 1.2
## Pattern    -0.05           0.02              0.68 0.44 0.56 1.0
## Picture    -0.06           0.69              0.04 0.45 0.55 1.0
## RAVLT       0.13           0.58             -0.04 0.41 0.59 1.1
## 
##                       Language Memory\nRecall Mental\nFlexibity
## SS loadings               1.12           0.84              0.78
## Proportion Var            0.19           0.14              0.13
## Cumulative Var            0.19           0.33              0.46
## Proportion Explained      0.41           0.31              0.28
## Cumulative Proportion     0.41           0.72              1.00
## 
##  With factor correlations of 
##                   Language Memory\nRecall Mental\nFlexibity
## Language              1.00           0.58              0.43
## Memory\nRecall        0.58           1.00              0.42
## Mental\nFlexibity     0.43           0.42              1.00
## 
## Mean item complexity =  1
## Test of the hypothesis that 3 factors are sufficient.
## 
## The degrees of freedom for the null model are  15  and the objective function was  1.02 with Chi Square of  2945.66
## The degrees of freedom for the model are 0  and the objective function was  0 
## 
## The root mean square of the residuals (RMSR) is  0 
## The df corrected root mean square of the residuals is  NA 
## 
## The harmonic number of observations is  2902 with the empirical chi square  0.01  with prob <  NA 
## The total number of observations was  2902  with Likelihood Chi Square =  0.01  with prob <  NA 
## 
## Tucker Lewis Index of factoring reliability =  -Inf
## Fit based upon off diagonal values = 1
## Measures of factor score adequacy             
##                                                   Language Memory\nRecall
## Correlation of (regression) scores with factors       0.85           0.80
## Multiple R square of scores with factors              0.73           0.65
## Minimum correlation of possible factor scores         0.45           0.30
##                                                   Mental\nFlexibity
## Correlation of (regression) scores with factors                0.78
## Multiple R square of scores with factors                       0.60
## Minimum correlation of possible factor scores                  0.21

3.1.8 plot and provide sum stats of the 1st order model

lavaan::summary(NeuroCog1stOrder.Fit, standardized = TRUE, rsquare = TRUE, fit.measures = TRUE)

## lavaan 0.6-11 ended normally after 31 iterations
## 
##   Estimator                                         ML
##   Optimization method                           NLMINB
##   Number of model parameters                        15
##                                                       
##   Number of observations                          2902
##                                                       
## Model Test User Model:
##                                                Standard      Robust
##   Test Statistic                                 20.462      20.029
##   Degrees of freedom                                  6           6
##   P-value (Chi-square)                            0.002       0.003
##   Scaling correction factor                                   1.022
##        Yuan-Bentler correction (Mplus variant)                     
## 
## Model Test Baseline Model:
## 
##   Test statistic                              2949.557    2772.060
##   Degrees of freedom                                15          15
##   P-value                                        0.000       0.000
##   Scaling correction factor                                  1.064
## 
## User Model versus Baseline Model:
## 
##   Comparative Fit Index (CFI)                    0.995       0.995
##   Tucker-Lewis Index (TLI)                       0.988       0.987
##                                                                   
##   Robust Comparative Fit Index (CFI)                         0.995
##   Robust Tucker-Lewis Index (TLI)                            0.988
## 
## Loglikelihood and Information Criteria:
## 
##   Loglikelihood user model (H0)             -23239.010  -23239.010
##   Scaling correction factor                                  1.121
##       for the MLR correction                                      
##   Loglikelihood unrestricted model (H1)     -23228.779  -23228.779
##   Scaling correction factor                                  1.093
##       for the MLR correction                                      
##                                                                   
##   Akaike (AIC)                               46508.020   46508.020
##   Bayesian (BIC)                             46597.617   46597.617
##   Sample-size adjusted Bayesian (BIC)        46549.957   46549.957
## 
## Root Mean Square Error of Approximation:
## 
##   RMSEA                                          0.029       0.028
##   90 Percent confidence interval - lower         0.016       0.015
##   90 Percent confidence interval - upper         0.043       0.042
##   P-value RMSEA <= 0.05                          0.994       0.996
##                                                                   
##   Robust RMSEA                                               0.029
##   90 Percent confidence interval - lower                     0.015
##   90 Percent confidence interval - upper                     0.043
## 
## Standardized Root Mean Square Residual:
## 
##   SRMR                                           0.014       0.014
## 
## Parameter Estimates:
## 
##   Standard errors                             Sandwich
##   Information bread                           Observed
##   Observed information based on                Hessian
## 
## Latent Variables:
##                         Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##   Language =~                                                                
##     NIHTBX_PICVOCA         1.000                               0.712    0.712
##     NIHTBX_READING         1.042    0.048   21.683    0.000    0.742    0.742
##   CognitiveFlexibity =~                                                      
##     NIHTBX_FLANKER         1.000                               0.721    0.721
##     NIHTBX_PATTERN         0.749    0.058   12.814    0.000    0.541    0.541
##   MemoryRecall =~                                                            
##     NIHTBX_PICTURE         1.000                               0.603    0.603
##     PEA_RAVLT_LD_T         1.152    0.068   16.872    0.000    0.694    0.695
## 
## Covariances:
##                         Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##   Language ~~                                                                
##     CognitivFlxbty         0.266    0.019   14.102    0.000    0.518    0.518
##     MemoryRecall           0.272    0.017   15.811    0.000    0.633    0.633
##   CognitiveFlexibity ~~                                                      
##     MemoryRecall           0.193    0.017   11.130    0.000    0.443    0.443
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##    .NIHTBX_PICVOCA    0.493    0.025   19.590    0.000    0.493    0.493
##    .NIHTBX_READING    0.449    0.026   17.058    0.000    0.449    0.449
##    .NIHTBX_FLANKER    0.479    0.040   12.110    0.000    0.479    0.479
##    .NIHTBX_PATTERN    0.707    0.030   23.896    0.000    0.707    0.708
##    .NIHTBX_PICTURE    0.637    0.026   24.382    0.000    0.637    0.637
##    .PEA_RAVLT_LD_T    0.517    0.032   16.296    0.000    0.517    0.518
##     Language          0.507    0.031   16.267    0.000    1.000    1.000
##     CognitivFlxbty    0.520    0.047   11.106    0.000    1.000    1.000
##     MemoryRecall      0.363    0.028   12.876    0.000    1.000    1.000
## 
## R-Square:
##                    Estimate
##     NIHTBX_PICVOCA    0.507
##     NIHTBX_READING    0.551
##     NIHTBX_FLANKER    0.521
##     NIHTBX_PATTERN    0.292
##     NIHTBX_PICTURE    0.363
##     PEA_RAVLT_LD_T    0.482

Plabels = c("Vocab","Reading","Flanker","Pattern","Picture","RAVLT","Language","Mental\nFlexibity","Memory\nRecall")
semPlot::semPaths(object=NeuroCog1stOrder.Fit,
                  intercepts = F, residuals = F, 
                  whatLabels="no", 
                  what = "std", 
                  layout="tree",
                  node.width = 1.4,
                  edge.label.cex = .6, 
                  nodeLabels=Plabels, 
                  edge.color="black",
                  sizeMan = 10, sizeLat = 10)

3.1.9 plot and provide sum stats of the 2nd-order model

lavaan::summary(NeuroCog2ndOrder.Fit, standardized = TRUE, rsquare = TRUE, fit.measures = TRUE)

## lavaan 0.6-11 ended normally after 29 iterations
## 
##   Estimator                                         ML
##   Optimization method                           NLMINB
##   Number of model parameters                        15
##                                                       
##   Number of observations                          2902
##                                                       
## Model Test User Model:
##                                                Standard      Robust
##   Test Statistic                                 20.462      20.029
##   Degrees of freedom                                  6           6
##   P-value (Chi-square)                            0.002       0.003
##   Scaling correction factor                                   1.022
##        Yuan-Bentler correction (Mplus variant)                     
## 
## Model Test Baseline Model:
## 
##   Test statistic                              2949.557    2772.060
##   Degrees of freedom                                15          15
##   P-value                                        0.000       0.000
##   Scaling correction factor                                  1.064
## 
## User Model versus Baseline Model:
## 
##   Comparative Fit Index (CFI)                    0.995       0.995
##   Tucker-Lewis Index (TLI)                       0.988       0.987
##                                                                   
##   Robust Comparative Fit Index (CFI)                         0.995
##   Robust Tucker-Lewis Index (TLI)                            0.988
## 
## Loglikelihood and Information Criteria:
## 
##   Loglikelihood user model (H0)             -23239.010  -23239.010
##   Scaling correction factor                                  1.121
##       for the MLR correction                                      
##   Loglikelihood unrestricted model (H1)     -23228.779  -23228.779
##   Scaling correction factor                                  1.093
##       for the MLR correction                                      
##                                                                   
##   Akaike (AIC)                               46508.020   46508.020
##   Bayesian (BIC)                             46597.617   46597.617
##   Sample-size adjusted Bayesian (BIC)        46549.957   46549.957
## 
## Root Mean Square Error of Approximation:
## 
##   RMSEA                                          0.029       0.028
##   90 Percent confidence interval - lower         0.016       0.015
##   90 Percent confidence interval - upper         0.043       0.042
##   P-value RMSEA <= 0.05                          0.994       0.996
##                                                                   
##   Robust RMSEA                                               0.029
##   90 Percent confidence interval - lower                     0.015
##   90 Percent confidence interval - upper                     0.043
## 
## Standardized Root Mean Square Residual:
## 
##   SRMR                                           0.014       0.014
## 
## Parameter Estimates:
## 
##   Standard errors                             Sandwich
##   Information bread                           Observed
##   Observed information based on                Hessian
## 
## Latent Variables:
##                         Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##   Language =~                                                                
##     NIHTBX_PICVOCA         1.000                               0.712    0.712
##     NIHTBX_READING         1.042    0.048   21.683    0.000    0.742    0.742
##   CognitiveFlexibity =~                                                      
##     NIHTBX_FLANKER         1.000                               0.721    0.721
##     NIHTBX_PATTERN         0.749    0.058   12.814    0.000    0.541    0.541
##   MemoryRecall =~                                                            
##     NIHTBX_PICTURE         1.000                               0.603    0.603
##     PEA_RAVLT_LD_T         1.152    0.068   16.872    0.000    0.694    0.695
##   g =~                                                                       
##     Language               0.613    0.029   20.816    0.000    0.860    0.860
##     CognitivFlxbty         0.434    0.025   17.213    0.000    0.602    0.602
##     MemoryRecall           0.444    0.027   16.494    0.000    0.736    0.736
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##     g                 1.000                               1.000    1.000
##    .NIHTBX_PICVOCA    0.493    0.025   19.590    0.000    0.493    0.493
##    .NIHTBX_READING    0.449    0.026   17.058    0.000    0.449    0.449
##    .NIHTBX_FLANKER    0.479    0.040   12.110    0.000    0.479    0.479
##    .NIHTBX_PATTERN    0.707    0.030   23.896    0.000    0.707    0.708
##    .NIHTBX_PICTURE    0.637    0.026   24.382    0.000    0.637    0.637
##    .PEA_RAVLT_LD_T    0.517    0.032   16.296    0.000    0.517    0.518
##    .Language          0.132    0.029    4.505    0.000    0.260    0.260
##    .CognitivFlxbty    0.332    0.039    8.535    0.000    0.637    0.637
##    .MemoryRecall      0.166    0.020    8.361    0.000    0.458    0.458
## 
## R-Square:
##                    Estimate
##     NIHTBX_PICVOCA    0.507
##     NIHTBX_READING    0.551
##     NIHTBX_FLANKER    0.521
##     NIHTBX_PATTERN    0.292
##     NIHTBX_PICTURE    0.363
##     PEA_RAVLT_LD_T    0.482
##     Language          0.740
##     CognitivFlxbty    0.363
##     MemoryRecall      0.542

Plabels = c("Vocab","Reading","Flanker","Pattern","Picture","RAVLT","Language","Mental\nFlexibity","Memory\nRecall",expression(paste(italic("g"))))
semPlot::semPaths(object=NeuroCog2ndOrder.Fit,intercepts = F, residuals = F, 
        whatLabels="no", what = "std", layout="tree", node.width = 1.4,
         edge.label.cex = .6, nodeLabels=Plabels, edge.color="black",
        sizeMan = 10, sizeLat = 10)

3.1.10 reliabity of the G-Factor

semTools::reliabilityL2(NeuroCog2ndOrder.Fit,"g")

##        omegaL1        omegaL2 partialOmegaL1 
##      0.6103282      0.7792178      0.7282842

3.1.11 plot and provide sum stats of the single factor model

lavaan::summary(NeuroCogSingleFactor.Fit, standardized = TRUE, rsquare = TRUE, fit.measures = TRUE)

## lavaan 0.6-11 ended normally after 20 iterations
## 
##   Estimator                                         ML
##   Optimization method                           NLMINB
##   Number of model parameters                        12
##                                                       
##   Number of observations                          2902
##                                                       
## Model Test User Model:
##                                                Standard      Robust
##   Test Statistic                                513.874     510.350
##   Degrees of freedom                                  9           9
##   P-value (Chi-square)                            0.000       0.000
##   Scaling correction factor                                   1.007
##        Yuan-Bentler correction (Mplus variant)                     
## 
## Model Test Baseline Model:
## 
##   Test statistic                              2949.557    2772.060
##   Degrees of freedom                                15          15
##   P-value                                        0.000       0.000
##   Scaling correction factor                                  1.064
## 
## User Model versus Baseline Model:
## 
##   Comparative Fit Index (CFI)                    0.828       0.818
##   Tucker-Lewis Index (TLI)                       0.713       0.697
##                                                                   
##   Robust Comparative Fit Index (CFI)                         0.828
##   Robust Tucker-Lewis Index (TLI)                            0.713
## 
## Loglikelihood and Information Criteria:
## 
##   Loglikelihood user model (H0)             -23485.716  -23485.716
##   Scaling correction factor                                  1.157
##       for the MLR correction                                      
##   Loglikelihood unrestricted model (H1)     -23228.779  -23228.779
##   Scaling correction factor                                  1.093
##       for the MLR correction                                      
##                                                                   
##   Akaike (AIC)                               46995.431   46995.431
##   Bayesian (BIC)                             47067.109   47067.109
##   Sample-size adjusted Bayesian (BIC)        47028.981   47028.981
## 
## Root Mean Square Error of Approximation:
## 
##   RMSEA                                          0.139       0.139
##   90 Percent confidence interval - lower         0.129       0.128
##   90 Percent confidence interval - upper         0.149       0.149
##   P-value RMSEA <= 0.05                          0.000       0.000
##                                                                   
##   Robust RMSEA                                               0.139
##   90 Percent confidence interval - lower                     0.129
##   90 Percent confidence interval - upper                     0.149
## 
## Standardized Root Mean Square Residual:
## 
##   SRMR                                           0.070       0.070
## 
## Parameter Estimates:
## 
##   Standard errors                             Sandwich
##   Information bread                           Observed
##   Observed information based on                Hessian
## 
## Latent Variables:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##   g =~                                                                  
##     NIHTBX_PICVOCA    1.000                               0.668    0.668
##     NIHTBX_READING    1.028    0.035   29.109    0.000    0.687    0.687
##     NIHTBX_FLANKER    0.670    0.045   14.852    0.000    0.447    0.447
##     NIHTBX_PATTERN    0.537    0.042   12.642    0.000    0.358    0.358
##     NIHTBX_PICTURE    0.706    0.046   15.506    0.000    0.472    0.472
##     PEA_RAVLT_LD_T    0.787    0.045   17.566    0.000    0.526    0.526
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##    .NIHTBX_PICVOCA    0.554    0.024   23.532    0.000    0.554    0.554
##    .NIHTBX_READING    0.528    0.025   21.063    0.000    0.528    0.528
##    .NIHTBX_FLANKER    0.800    0.026   30.533    0.000    0.800    0.800
##    .NIHTBX_PATTERN    0.871    0.025   34.266    0.000    0.871    0.872
##    .NIHTBX_PICTURE    0.777    0.024   32.574    0.000    0.777    0.777
##    .PEA_RAVLT_LD_T    0.723    0.026   28.200    0.000    0.723    0.724
##     g                 0.446    0.029   15.272    0.000    1.000    1.000
## 
## R-Square:
##                    Estimate
##     NIHTBX_PICVOCA    0.446
##     NIHTBX_READING    0.472
##     NIHTBX_FLANKER    0.200
##     NIHTBX_PATTERN    0.128
##     NIHTBX_PICTURE    0.223
##     PEA_RAVLT_LD_T    0.276

Plabels = c("Vocab","Reading","Flanker","Pattern","Picture","RAVLT", expression(paste(italic("g"))))
semPlot::semPaths(object=NeuroCogSingleFactor.Fit,
                  intercepts = F, residuals = F, 
                  whatLabels="no", 
                  what = "std", 
                  layout="tree",
                  node.width = 1.4,
                  edge.label.cex = .6, 
                  nodeLabels=Plabels, 
                  edge.color="black",
                  sizeMan = 10, sizeLat = 10)

3.1.12 testing the stability of g-factor

data_first_year <- bind_rows(data_train,data_valid,data_baseline)

NeuroCog2ndOrder <-'
Language =~ NIHTBX_PICVOCAB_UNCORRECTED + NIHTBX_READING_UNCORRECTED
CognitiveFlexibity =~ NIHTBX_FLANKER_UNCORRECTED + NIHTBX_PATTERN_UNCORRECTED
MemoryRecall =~ NIHTBX_PICTURE_UNCORRECTED + PEA_RAVLT_LD_TRIAL_VII_TC
g =~ NA*Language + CognitiveFlexibity  + MemoryRecall #estimate the loading of GenAbi -> as opposed to using it as a marker
g ~~ 1*g #need to constrain variance to 1'

NeuroCog2ndOrder.Fit_First_year <- lavaan::cfa(model = NeuroCog2ndOrder, 
                                               data = data_first_year,
                                               estimator="MLR")

CFA_NeuroCog2ndOrder_first_year <- 
   data_first_year %>% select(SUBJECTKEY) %>%
   bind_cols(
    lavaan::lavPredict(NeuroCog2ndOrder.Fit_First_year, 
                       newdata =data_first_year)  %>% as_tibble() %>% 
    rename(g_2ndOrder_first_year = g)) %>%
  select(SUBJECTKEY, g_2ndOrder_first_year)

CFA_all_first_year <- 
  CFA_response$enet_tuning %>% bind_cols(fold = "first_layer") %>%
  bind_rows(CFA_response$RanFor_tuning %>% bind_cols(fold = "second_layer") %>%
              bind_rows(CFA_response$testing_baseline %>% bind_cols(fold = "baseline_test"))) %>%
  left_join(CFA_NeuroCog2ndOrder_first_year, by = "SUBJECTKEY")

3.1.13 correlation in 2nd-order g between those based on first layer train vs. whole baseline

CFA_all_first_year %>% 
  group_by(fold) %>%
  summarize (cor=cor (g_second_order, g_2ndOrder_first_year))

## # A tibble: 3 x 2
##   fold            cor
##   <chr>         <dbl>
## 1 baseline_test 0.997
## 2 first_layer   0.997
## 3 second_layer  0.997

fold.labs <- c("1st-layer train\nr = .9973686", "2nd-layer train\nr = .9973635", "Baseline Test\nr = .9970127")
names(fold.labs) <- c("first_layer", "second_layer","baseline_test" )

CFA_all_first_year %>%
  mutate(fold=factor(fold, levels=c("first_layer", "second_layer","baseline_test"))) %>%
  ggplot(aes(x=g_second_order, y=g_2ndOrder_first_year)) +
  geom_point() + 
  geom_smooth(method=lm, color = "grey") + 
  facet_wrap(~fold, ncol = 3, nrow = 1,
             labeller = labeller(fold = fold.labs)) +
  labs(x = expression(paste("2nd-order ",italic("g"), "-Factor Based on 1st-Layer Train Data")),
       y = ~ atop(paste("2nd-order ",italic("g"),"-Factor"), "Based on Full Baseline Data")) +
   theme(legend.position = "none") +
   #ggpubr::stat_cor(method = "pearson") + #label.x = -5, label.y = 30)
   theme(text=element_text(size=19))

## Don't know how to automatically pick scale for object of type lavaan.matrix/matrix. Defaulting to continuous.

## `geom_smooth()` using formula 'y ~ x'

lowerFn <- function(data, mapping, continuous = "cor", ...) {
  p <- ggplot(data = data, mapping = mapping) +
    geom_point(colour = "black") +
    geom_smooth(color = "grey", ...)
  p
}

CFA_all_first_year_g <- CFA_all_first_year %>%
  select(g_second_order,g_single_factor,g_EFA_CFA)

colnames(CFA_all_first_year_g) <- 
  make.names(c('2nd Order', 'Single Factor', 'EFA CFA'))

g_first_layer_plot <- CFA_all_first_year %>%
  filter(fold == "first_layer") %>%
  select(g_second_order,g_single_factor,g_EFA_CFA) %>%
  GGally::ggpairs(.,
                  title = "1st-Layer Train Data", 
                  columnLabels = gsub('X', '', gsub('.', '\n', colnames(CFA_all_first_year_g), fixed = T)), 
                  lower = list(continuous = lowerFn))

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

g_second_layer_plot <- CFA_all_first_year %>%
  filter(fold == "second_layer") %>%
  select(g_second_order,g_single_factor,g_EFA_CFA) %>%
  GGally::ggpairs(.,
                  title = "2nd-Layer Train Data", 
                  columnLabels = gsub('X', '', gsub('.', '\n', colnames(CFA_all_first_year_g), fixed = T)), 
                  lower = list(continuous = lowerFn)) 

g_baseline_test_plot <- CFA_all_first_year %>%
  filter(fold == "baseline_test") %>%
  select(g_second_order,g_single_factor,g_EFA_CFA) %>%
  GGally::ggpairs(.,
                  title = "Baseline Test Data",
                  columnLabels = gsub('X', '', gsub('.', '\n', colnames(CFA_all_first_year_g), fixed = T)), 
                  lower = list(continuous = lowerFn))

g_followup_test_plot <- 
CFA_response$testing_followup %>%
  select(g_second_order,g_single_factor,g_EFA_CFA) %>%
  GGally::ggpairs(.,
                  title = "Follow-up Test Data",
                  columnLabels = gsub('X', '', gsub('.', '\n', colnames(CFA_all_first_year_g), fixed = T)), 
                  lower = list(continuous = lowerFn))

G_plot_grid <- cowplot::plot_grid(
  GGally::ggmatrix_gtable(g_first_layer_plot),
  GGally::ggmatrix_gtable(g_second_layer_plot),
  GGally::ggmatrix_gtable(g_baseline_test_plot),
  GGally::ggmatrix_gtable(g_followup_test_plot),
 # ggmatrix_gtable(g3),
  nrow = 2
)

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# now add the title
title <- ggdraw() + 
  draw_label(
    expression(paste("Relationship among ",italic("g"), "-Factor Based on Different CFA models at Different Data Splits")),
    fontface = 'bold',
    x = 0,
    hjust = 0
  ) +
  theme(
    # add margin on the left of the drawing canvas,
    # so title is aligned with left edge of first plot
    plot.margin = margin(0, 0, 0, 7)
  )
plot_grid(
  title, G_plot_grid,
  ncol = 1,
  # rel_heights values control vertical title margins
  rel_heights = c(0.05, 1)
)

3.1.14 Joining extracted factor scores with the data:

#CFA_Resp_Var <- names(CFA_response[["enet_tuning"]])[-1]%>% set_names()
## to focus on g_second_order, remove g_single_factora and g_EFA_CFA
CFA_Resp_Var <- names(CFA_response[["enet_tuning"]] %>%
                        select(-g_single_factor, -g_EFA_CFA))[-1]%>% set_names()


train_CFA <- left_join(CFA_response[["enet_tuning"]],
                       analysis( baseline_two_fold$splits[[1]]), 
                       by = "SUBJECTKEY")
valid_CFA <- left_join(CFA_response[["RanFor_tuning"]],
                       assessment( baseline_two_fold$splits[[1]]), 
                       by = "SUBJECTKEY")
baseline_CFA <- left_join(CFA_response[["testing_baseline"]],
                          baseline_test, 
                          by = "SUBJECTKEY")
followup_CFA <- left_join(CFA_response[["testing_followup"]],
                          followup_analysis, 
                          by = "SUBJECTKEY")

3.1.15 look at the number

nrow(train_CFA)

## [1] 2906

nrow(valid_CFA)

## [1] 2917

nrow(baseline_CFA )

## [1] 5427

nrow(followup_CFA )

## [1] 5280

3.1.16 Plot distribution of G factor

GDistData <- data.table::data.table(G_Factor = train_CFA$g_second_order, Split = "1st-Layer Train") %>%
  rbind(data.table::data.table(G_Factor = valid_CFA$g_second_order, Split = "2nd-Layer Train")) %>%
  rbind(data.table::data.table(G_Factor = baseline_CFA$g_second_order, Split = "Baseline Test")) %>%
  rbind(data.table::data.table(G_Factor = followup_CFA$g_second_order, Split = "Follow-Up Test")) 

ggplot(data = GDistData, aes(x = G_Factor, color = Split)) + 
  geom_density() +
  labs(x= expression(paste("2nd-order ",italic("g")))) +
  labs(y= "Density") +
  theme(legend.title=element_blank()) +
  theme(text = element_text(size=30)) +
  theme(legend.position="top", legend.box="vertical", legend.margin=margin()) +
  guides(color=guide_legend(nrow=2,byrow=TRUE))

3.1.17 set CFA variable names for ploting

CFA_var_plotting <- tibble("response" =CFA_Resp_Var, 
                           "longer_name"=c("1st-Order Language", 
                                           "1st-Order Mental Flexibility", 
                                           "1st-Order Memory Recall", 
                                           "2nd-Order G-Factor"),
                           "short_name"=c("Language", 
                                          "Mental Flexibility", 
                                          "Memory Recall",
                                          "G-Factor"))

3.2 model fitting for Confirmatory factor analysis

3.2.1 data preping for elastic net for factor scores

Nback_enet_CFA <- CFA_Resp_Var %>% 
  map(.,~enet_processing_nocom(resp_var = .,
                               split_train = train_CFA, 
                               split_valid =  valid_CFA, 
                               split_test = baseline_CFA,
                               followup_test = followup_CFA,
                               contrast_name = "X2backvs0back_ROI_"))

SST_enet_CFA <- CFA_Resp_Var %>% 
  map(.,~enet_processing_nocom(resp_var = .,
                               split_train = train_CFA, 
                               split_valid =  valid_CFA, 
                               split_test = baseline_CFA,
                               followup_test = followup_CFA ,
                               contrast_name = "anystopvscorrectgo_ROI_"))

MID_enet_CFA <- CFA_Resp_Var %>% 
  map(.,~enet_processing_nocom(resp_var = .,
                               split_train = train_CFA, 
                               split_valid =  valid_CFA, 
                               split_test = baseline_CFA,
                               followup_test = followup_CFA,
                               contrast_name = "antiRewVsNeu_ROI_"))

DTI_enet_CFA <-  CFA_Resp_Var %>% 
  map(.,~enet_processing_comOne(resp_var = .,
                                split_train = train_CFA, 
                                split_valid =  valid_CFA, 
                                split_test = baseline_CFA,
                                followup_test = followup_CFA,
                                measure_one = "FA_"))

smri_enet_CFA <- CFA_Resp_Var %>% 
  map(.,~enet_processing_com(resp_var = .,
                             split_train = train_CFA, 
                             split_valid =  valid_CFA, 
                             split_test = baseline_CFA,
                             followup_test = followup_CFA,
                             measure_one = "ASEG_",
                             measure_two = "Dest_Thick_"))

rsmri_enet_CFA <- CFA_Resp_Var %>% 
  map(.,~enet_processing_com(resp_var = .,
                             split_train = train_CFA, 
                             split_valid =  valid_CFA, 
                             split_test = baseline_CFA,
                             followup_test = followup_CFA,
                             measure_one = "par",
                             measure_two = "Within"))

enet_CFA_data <- list("Nback"=Nback_enet_CFA,
                      "SST"=SST_enet_CFA,
                      "MID"=MID_enet_CFA,
                      "rsmri"=rsmri_enet_CFA,
                      "smri"=smri_enet_CFA,
                      "DTI"=DTI_enet_CFA)

3.2.2 actual parameter tuning for elastic net for factor scores

##keep from using all the cores
##keep your computer safe
no_cores <- availableCores() - 4
plan(multisession,workers= no_cores)

Nback_CFA <- CFA_Resp_Var %>% 
  future_map(.,~enet_tuning(resp_var =.,
                            data_list = Nback_enet_CFA[[.]]),
             .options = furrr_options(seed=TRUE))

SST_CFA <- CFA_Resp_Var %>% 
  future_map(.,~enet_tuning(resp_var =.,
                            data_list = SST_enet_CFA[[.]]),
             .options = furrr_options(seed=TRUE))

MID_CFA <- CFA_Resp_Var %>% 
  future_map(.,~enet_tuning(resp_var =.,
                            data_list = MID_enet_CFA[[.]]),
             .options = furrr_options(seed=TRUE))

DTI_CFA <-  CFA_Resp_Var %>% 
  future_map(.,~enet_tuning(resp_var =.,
                            data_list = DTI_enet_CFA[[.]]),
             .options = furrr_options(seed=TRUE))

smri_CFA <- CFA_Resp_Var %>% 
  future_map(.,~enet_tuning(resp_var =.,
                            data_list = smri_enet_CFA[[.]]),
             .options = furrr_options(seed=TRUE))

rsmri_CFA <- CFA_Resp_Var %>% 
  future_map(.,~enet_tuning(resp_var =.,
                            data_list = rsmri_enet_CFA[[.]]),
             .options = furrr_options(seed=TRUE))

enet_CFA <- list("Nback_results"=Nback_CFA,
                 "SST_results"=SST_CFA,
                 "MID_results"=MID_CFA,
                 "rsmri_results"=rsmri_CFA,
                 "smri_results"=smri_CFA,
                 "DTI_results"=DTI_CFA)

3.2.3 create table of the number of subjects in each data splits

datasplits_number(data_list = enet_CFA,resp_vec = CFA_Resp_Var, resp_plotting = CFA_var_plotting)

[[1]]

Sample size of all the data sets
modality	enet_train	stack_train	baseline	followup	short_name
Nback	1369	1331	2653	2996	Language
Nback	1369	1331	2653	2996	Mental Flexibility
Nback	1369	1331	2653	2996	Memory Recall
Nback	1369	1331	2653	2996	G-Factor

[[2]]

Sample size of all the data sets
modality	enet_train	stack_train	baseline	followup	short_name
SST	1412	1421	2791	2903	Language
SST	1412	1421	2791	2903	Mental Flexibility
SST	1412	1421	2791	2903	Memory Recall
SST	1412	1421	2791	2903	G-Factor

[[3]]

Sample size of all the data sets
modality	enet_train	stack_train	baseline	followup	short_name
MID	1611	1591	3138	3127	Language
MID	1611	1591	3138	3127	Mental Flexibility
MID	1611	1591	3138	3127	Memory Recall
MID	1611	1591	3138	3127	G-Factor

[[4]]

Sample size of all the data sets
modality	enet_train	stack_train	baseline	followup	short_name
rsmri	1988	2007	3779	4137	Language
rsmri	1988	2007	3779	4137	Mental Flexibility
rsmri	1988	2007	3779	4137	Memory Recall
rsmri	1988	2007	3779	4137	G-Factor

[[5]]

Sample size of all the data sets
modality	enet_train	stack_train	baseline	followup	short_name
smri	2395	2428	4507	4569	Language
smri	2395	2428	4507	4569	Mental Flexibility
smri	2395	2428	4507	4569	Memory Recall
smri	2395	2428	4507	4569	G-Factor

[[6]]

Sample size of all the data sets
modality	enet_train	stack_train	baseline	followup	short_name
DTI	2268	2260	4071	4230	Language
DTI	2268	2260	4071	4230	Mental Flexibility
DTI	2268	2260	4071	4230	Memory Recall
DTI	2268	2260	4071	4230	G-Factor

modality_enet_CFA <-names(enet_CFA)

param_tune_enet <- modality_enet_CFA %>% 
  map(.,function(modality_input=.,list_input=enet_CFA){
  tuned_results <- list_input[[modality_input]][["g_second_order"]][["param_tune"]]%>%
                   mutate(modality=modality_input)
  return(tuned_results)
})%>% do.call(rbind,.)%>%
  select("penalty", "mixture","modality")

 param_tune_enet %>%
  kableExtra::kbl(caption = "Tuned parameters for elastic net") %>%
                                     kableExtra::kable_classic(full_width = F, html_font = "Cambria")

Tuned parameters for elastic net
penalty	mixture	modality
1.1489510	0.0	Nback_results
0.0195640	1.0	SST_results
0.0252354	1.0	MID_results
0.6080224	0.0	rsmri_results
0.1162322	0.1	smri_results
0.1162322	0.0	DTI_results

3.2.4 look at the mean and sd of penalty

# mean of penalty term 0.6190428    
param_tune_enet %>% summarise_at("penalty", mean)

## # A tibble: 1 x 1
##   penalty
##     <dbl>
## 1   0.339

# sd of penalty term 0.4316607  
param_tune_enet %>% summarise_at("penalty", sd)

## # A tibble: 1 x 1
##   penalty
##     <dbl>
## 1   0.453

3.2.5 Fit the tuned enet model to enetXplorer for the factor scores

enetXplorer allows us to use permutation to test the significance of each feature

enet_CFA_xplorer <- enet_CFA
names(enet_CFA_xplorer) <- modality_vec

##keep from using all the cores
##keep your computer safe
no_cores <- availableCores() - 4
plan(multisession,workers= no_cores)

enet_CFA_fit <- modality_vec  %>%
  map(.,~fitting_enetxplorer(resp_vec = CFA_Resp_Var,
                             modality_var = .,
                             data_input = enet_CFA_data,
                             tuning_input = enet_CFA_xplorer))

names(enet_CFA_fit)<- modality_vec

3.2.6 results of enetXplorer for each factor score

modality_vec %>% map(.,~enet_fit_table(modality_input=.,
                                       fit_input = enet_CFA_fit, 
                                       tune_input = enet_CFA_xplorer,
                                       resp_input=CFA_Resp_Var, 
                                       plotting_names = CFA_var_plotting))

[[1]]

EnetXplorer fitting of Nback
Best-tune lambda	Accuracy (Pearson r)	P-value	respones
1.1486	0.4058	0.0004	Language
2.0159	0.2895	0.0004	Mental Flexibility
2.1021	0.3251	0.0004	Memory Recall
1.2884	0.4097	0.0004	G-Factor

[[2]]

EnetXplorer fitting of SST
Alpha	Best-tune lambda	Accuracy (Pearson r)	P-value	respones
1.0	0.0076	0.1731	0.0004	Language
1.0	0.0211	0.1447	0.0004	Mental Flexibility
0.1	0.0087	0.1218	0.0012	Memory Recall
1.0	0.0071	0.1784	0.0004	G-Factor

[[3]]

EnetXplorer fitting of MID
Alpha	Best-tune lambda	Accuracy (Pearson r)	P-value	respones
1	0.0284	0.2756	0.0004	Language
0	2.2486	0.1614	0.0004	Mental Flexibility
0	1.5746	0.2128	0.0004	Memory Recall
1	0.0236	0.2649	0.0004	G-Factor

[[4]]

EnetXplorer fitting of rsmri
Best-tune lambda	Accuracy (Pearson r)	P-value	respones
0.3352	0.2410	0.0004	Language
1.2950	0.1434	0.0004	Mental Flexibility
1.2108	0.1681	0.0004	Memory Recall
0.4319	0.2281	0.0004	G-Factor

[[5]]

EnetXplorer fitting of smri
Alpha	Best-tune lambda	Accuracy (Pearson r)	P-value	respones
0.1	0.0913	0.2483	0.0004	Language
0.0	2.6983	0.1540	0.0004	Mental Flexibility
0.0	2.0348	0.1881	0.0004	Memory Recall
0.1	0.1134	0.2387	0.0004	G-Factor

[[6]]

EnetXplorer fitting of DTI
Alpha	Best-tune lambda	Accuracy (Pearson r)	P-value	respones
0	0.0128	0.2370	0.0004	Language
1	0.0002	0.1776	0.0004	Mental Flexibility
0	0.0394	0.1758	0.0004	Memory Recall
0	0.0134	0.2292	0.0004	G-Factor

3.2.7 Plot results from eNetExplorer with task-fMRI predicting each factor score

map2(.x=modality_type_vec,
     .y=str_type_vec,
     ~enet_interval_plot(data_input=enet_CFA_fit,
                         tuning_input=enet_CFA_xplorer,
                         resp_input =CFA_Resp_Var,
                         modality_input = .x,
                         str_input = .y, 
                         plotting_names = CFA_var_plotting))

## [[1]]
## [[1]]$Lang_first_order

## 
## [[1]]$CogFlex_first_order

## 
## [[1]]$MemRe_first_order

## 
## [[1]]$g_second_order

## 
## 
## [[2]]
## [[2]]$Lang_first_order

## 
## [[2]]$CogFlex_first_order

## 
## [[2]]$MemRe_first_order

## 
## [[2]]$g_second_order

## 
## 
## [[3]]
## [[3]]$Lang_first_order

## 
## [[3]]$CogFlex_first_order

## 
## [[3]]$MemRe_first_order

## 
## [[3]]$g_second_order

3.2.8 Plot results from eNetExplorer with sMRI predicting each factor score

enet_interval_plot_smri(data_input=enet_CFA_fit,tuning_input=enet_CFA,resp_input = CFA_Resp_Var, plotting_names = CFA_var_plotting)

## $Lang_first_order

## 
## $CogFlex_first_order

## 
## $MemRe_first_order

## 
## $g_second_order

### Plot results from eNetExplorer with rs-fMRI predicting each factor score

enet_interval_plot_rsmri(data_input=enet_CFA_fit,tuning_input=enet_CFA,resp_input = CFA_Resp_Var, plotting_names = CFA_var_plotting)

## $Lang_first_order

## 
## $CogFlex_first_order

## 
## $MemRe_first_order

## 
## $g_second_order

3.2.9 Plot results from eNetExplorer with DTI predicting each cognitive task

enet_interval_plot_DTI(data_input=enet_CFA_fit,tuning_input=enet_CFA,resp_input = CFA_Resp_Var, plotting_names = CFA_var_plotting)

## $Lang_first_order

## 
## $CogFlex_first_order

## 
## $MemRe_first_order

## 
## $g_second_order

3.2.10 Plot brain images from eNetExplorer for tasks and sMRI predicting G-Factor

    library(ggseg)
    library(ggsegExtra)

## Loading required package: ggseg3d

## Registered S3 method overwritten by 'geomorph':
##   method    from   
##   print.pls parsnip

    library(ggsegDesterieux)

    
    modality_type_vec <- list("Nback","SST","MID","smri")
    # modality_word_vec <- list("NBack-fMRI", "SST-fMRI","MID-fMRI","sMRI")
    modality_type_word_vec <- list(c("Nback","NBack-fMRI"),
                                   c("SST","SST-fMRI"),
                                   c("MID","MID-fMRI"),
                                   c("smri","sMRI"))

    str_type_vec <- list("X2backvs0back_ROI_",
                         "anystopvscorrectgo_ROI_",
                         "antiRewVsNeu_ROI_",
                         c("ASEG_","Dest_Thick_")) 
    
    #CFA_Resp_vec <- c("Lang_first_order", "CogFlex_first_order", "MemRe_first_order", "g_second_order")
    
    brainPrepTibFunc <- function(enet_CFA_fit, 
                                 modality_type, 
                                 str_type, 
                                 CFA_Resp) { 
      
      if (modality_type[1] == "smri") {
        brainPlotTib <- extract_tibble(enet_CFA_fit[[modality_type[1]]][[CFA_Resp]], 
                                       alpha_index = paste0("a",
                                                            enet_CFA_xplorer[[modality_type[1]]][[CFA_Resp]][["param_tune"]][["mixture"]])) %>%
          filter(type == "Target") %>% 
          mutate(.,label = str_replace_all(variable, str_type[1], '')) %>%
          mutate(.,label = str_replace_all(label, str_type[2], '')) %>%
          mutate(.,label = str_replace_all(label, '\\.', '-')) %>%
          mutate(.,label = str_replace_all(label, 'Brain-Stem', 'brain-stem')) %>% 
          mutate(.,label = str_replace_all(label, 'Right-Cerebellum-Cortex', 'right-cerebellum-cortex'))
        }
      else{
      brainPlotTib <- extract_tibble(enet_CFA_fit[[modality_type[1]]][[CFA_Resp]], 
                     alpha_index = paste0("a",
                                          enet_CFA_xplorer[[modality_type[1]]][[CFA_Resp]][["param_tune"]][["mixture"]])) %>%
      filter(type == "Target") %>% 
      mutate(.,label = str_replace_all(variable, str_type, '')) %>%
      mutate(.,label = str_replace_all(label, '\\.', '-')) %>%
      mutate(.,label = str_replace_all(label, 'Brain-Stem', 'brain-stem')) %>% 
      mutate(.,label = str_replace_all(label, 'Right-Cerebellum-Cortex', 'right-cerebellum-cortex'))
      }
      
      coefs_ggsegDes_all <- 
        ggsegDesterieux::desterieux %>% as_tibble %>% 
        select(label) %>% 
        na.omit() %>% 
        left_join(select(brainPlotTib, 
                         label, 
                         wmean, 
                         pvalue,
                         variable), by = "label")
      
      coefs_ggsegAseg_all <- 
        ggseg::aseg$data %>% as_tibble %>% 
        select(label) %>% 
        na.omit() %>% 
        left_join(select(brainPlotTib, 
                         label, 
                         wmean, 
                         pvalue,
                         variable), by = "label") 
      
      coefs_ggsegMax <- max(max(coefs_ggsegDes_all$wmean, na.rm = TRUE),
                            max(coefs_ggsegAseg_all$wmean, na.rm = TRUE))
      coefs_ggsegMin <- min(min(coefs_ggsegDes_all$wmean, na.rm = TRUE),
                            min(coefs_ggsegAseg_all$wmean, na.rm = TRUE))
      
      plot_ggsegDes_all <- 
        ggsegDesterieux::desterieux %>% as_tibble %>% 
        right_join(coefs_ggsegDes_all, by ="label") %>%
        select(label,  wmean, pvalue) %>% filter(pvalue <.05) %>%
        ggseg(atlas = 'desterieux', 
              mapping = aes(fill = wmean),
              colour="black"
        ) + 
        theme_void() +
        scale_fill_gradient2(
          limits = c(coefs_ggsegMin, coefs_ggsegMax), 
          midpoint = 0, low = "blue", mid = "white",
          high = "red", space = "Lab", na.value="transparent" ) +
        theme(legend.position = "none")+
        labs(title =paste0(modality_type[2])) 
      
      plot_ggsegAseg_all <- 
      ggseg::aseg$data %>% as_tibble %>% 
        right_join(coefs_ggsegAseg_all, by ="label") %>%
        select(label,  wmean, pvalue) %>% filter(pvalue <.05) %>%
        ggseg(atlas="aseg", 
              mapping=aes(fill=wmean),
              view = "axial",
              colour="black") +
        theme_void() +
        scale_fill_gradient2(
              limits = c(coefs_ggsegMin, coefs_ggsegMax),
          midpoint = 0, low = "blue", mid = "white",
          high = "red", space = "Lab", na.value="transparent" ) +
        guides(fill = guide_colourbar(barwidth = 0.5, barheight = 3, title = NULL))
        theme(legend.position = "none")
      
      plotPreptaskfMRI <- 
        list(brainPlotTib,
             coefs_ggsegDes_all, 
             coefs_ggsegAseg_all,
             coefs_ggsegMax,
             coefs_ggsegMin,
             plot_ggsegDes_all,
             plot_ggsegAseg_all) %>% 
        purrr::set_names("brainPlotTib","coefs_ggsegDes_all", "coefs_ggsegAseg_all",
                         "coefs_ggsegMax","coefs_ggsegMin","plot_ggsegDes_all", "plot_ggsegAseg_all")
      
      return(plotPreptaskfMRI)
    }
    
    plotPreptaskfMRI  <-
      modality_type_word_vec %>%
      purrr::set_names(purrr::map(.,1)) %>% 
      purrr::map2(.x = ., 
                 .y = str_type_vec,
         ~  brainPrepTibFunc(enet_CFA_fit = enet_CFA_fit,
                             modality_type = .x,
                           str_type = .y,
                           CFA_Resp = "g_second_order"))

## merging atlas and data by 'label'

## merging atlas and data by 'label'

## Warning: Some data not merged properly. Check for naming errors in data:
##   atlas type  hemi  side  region label                    wmean  pvalue
##   <chr> <chr> <chr> <chr> <chr>  <chr>                    <dbl>   <dbl>
## 1 <NA>  <NA>  <NA>  <NA>  <NA>   right-cerebellum-cortex 0.0301 0.00120

## merging atlas and data by 'label'
## merging atlas and data by 'label'
## merging atlas and data by 'label'
## merging atlas and data by 'label'
## merging atlas and data by 'label'
## merging atlas and data by 'label'

    ggpubr::ggarrange(plotlist=purrr::map(plotPreptaskfMRI,"plot_ggsegDes_all")
              ,nrow=4)

    # ggpubr::ggarrange(plotlist=purrr::map(plotPreptaskfMRI,"plot_ggsegAs_all")
    #                   ,nrow=4)
    #               
    modality_type_vec %>% map(.,~gridExtra::grid.arrange(plotPreptaskfMRI[[.]][["plot_ggsegDes_all"]],
                                                         plotPreptaskfMRI[[.]][["plot_ggsegAseg_all"]],
                                                         nrow = 1, ncol = 2, widths = c(4, 1.5)))

## [[1]]
## TableGrob (1 x 2) "arrange": 2 grobs
##   z     cells    name           grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 
## [[2]]
## TableGrob (1 x 2) "arrange": 2 grobs
##   z     cells    name           grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 
## [[3]]
## TableGrob (1 x 2) "arrange": 2 grobs
##   z     cells    name           grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 
## [[4]]
## TableGrob (1 x 2) "arrange": 2 grobs
##   z     cells    name           grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]

3.2.11 Plot brain images from eNetExplorer for DTI predicting G-Factor

  library(ggsegJHU)
    
    HaglerToJhu <- read_csv(paste0(utilFold, "HaglerToJhu.csv"))

## Rows: 44 Columns: 5
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (5): hemi, region, side, label, Hagler
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

    DTITrib <- extract_tibble(enet_CFA_fit[["DTI"]][["g_second_order"]], 
                   alpha_index = paste0("a", enet_CFA_xplorer[["DTI"]][["g_second_order"]][["param_tune"]][["mixture"]])) %>% 
      filter(type == "Target") %>%  
      mutate(.,Hagler = str_replace_all(variable, "FA_", '')) 
   
    ggsegJHU::jhu %>% as_tibble %>% 
      left_join(HaglerToJhu, by = c("hemi","region","side","label")) %>% 
      left_join(DTITrib %>% select(-type), by = "Hagler") %>% 
      filter(pvalue <.05) %>%
      select( -geometry) %>%
      ggseg(atlas = 'jhu', 
            mapping = aes(fill = wmean),
            colour="black") + 
      theme_void() +
    theme(text = element_text(size=30)) + 
    labs(title = "DTI") +
      scale_fill_gradient2(
#        limits = c(coefs_ggsegMin, coefs_ggsegMax),
        midpoint = 0, low = "blue", mid = "white",
        high = "red", space = "Lab", na.value="transparent" ) +
      guides(fill = guide_colourbar(barwidth = 1.5, barheight = 9, title = NULL))

## merging atlas and data by 'atlas', 'type', 'hemi', 'region', 'side', 'label'

    theme(legend.position = "none")

## List of 1
##  $ legend.position: chr "none"
##  - attr(*, "class")= chr [1:2] "theme" "gg"
##  - attr(*, "complete")= logi FALSE
##  - attr(*, "validate")= logi TRUE

3.3 Combine the predicted values with Random Forest Oppornistic Stacking for factor scores

3.3.1 Random Forest Data preparation for factor scores

enet_rf_valid_CFA <- CFA_Resp_Var %>% 
  map(.,~rnd_for_data_prep(pred_select = "predicted",
                           data_list = enet_CFA, 
                           resp_chose = .,
                           resp_data = valid_CFA))

enet_rf_test_baseline_CFA <- CFA_Resp_Var %>% 
  map(.,~rnd_for_data_prep(pred_select="predicted_baseline", 
                           data_list = enet_CFA, 
                           resp_chose = .,
                           resp_data = baseline_CFA))

enet_rf_test_followup_CFA <- CFA_Resp_Var %>% 
  map(.,~rnd_for_data_prep(pred_select="predicted_followup", 
                           data_list = enet_CFA, 
                           resp_chose = .,
                           resp_data = followup_CFA))


CFA_stack_valid <- CFA_Resp_Var %>% 
  map(.,~stack_datasize_extract(resp_input = .,data_input = enet_rf_valid_CFA))%>%
  do.call(rbind,.)%>%
  mutate(number_valid = datasplit_number)%>%
  subset(.,select=-c(datasplit_number))

CFA_stack_baseline <- CFA_Resp_Var %>% 
  map(.,~stack_datasize_extract(resp_input = .,data_input = enet_rf_test_baseline_CFA))%>%
  do.call(rbind,.)%>%
  mutate(baseline_test = datasplit_number)%>%
  subset(.,select=-c(datasplit_number))


CFA_stack_followup <- CFA_Resp_Var %>% 
  map(.,~stack_datasize_extract(resp_input = .,data_input = enet_rf_test_followup_CFA))%>%
  do.call(rbind,.)%>%
  mutate(followup_test = datasplit_number)%>%
  subset(.,select=-c(datasplit_number))

plyr::join_all(list(CFA_stack_valid,CFA_stack_baseline,CFA_stack_followup,CFA_var_plotting), 
                 type="left", by = "response")%>%
  select("short_name","number_valid",ends_with("test"))%>%
  kableExtra::kbl(caption = paste0("Sample size of all the data sets in stacked models ")) %>%
   kableExtra::kable_classic(full_width = F, html_font = "Cambria")

Sample size of all the data sets in stacked models
short_name	number_valid	baseline_test	followup_test
Language	2740	5108	5119
Mental Flexibility	2740	5108	5119
Memory Recall	2740	5108	5119
G-Factor	2740	5108	5119

3.3.2 random forest model fit for factor scores

##keep from using all the cores
##keep your computer safe
no_cores <- availableCores() - 4
plan(multisession,workers= no_cores)

rf_fit_CFA <- CFA_Resp_Var %>% 
  future_map(.,~random_forest_tuning(resp=.,
                                     valid_list = enet_rf_valid_CFA,
                                     baseline_test = enet_rf_test_baseline_CFA,
                                     followup_test = enet_rf_test_followup_CFA),
             .options = furrr_options(seed=TRUE),
             .progress = TRUE)

3.3.3 print the tuned parameters and model performace of the random forest model for each factor score

CFA_rf_results <- random_fores_output_print(rf_output = rf_fit_CFA,
                                            enet_output = enet_CFA,
                                            response_vec=CFA_Resp_Var,
                                            names_plotting = CFA_var_plotting)

CFA_rf_results[["rf_performance"]]%>%
  rename("Response Variable" = respones, 
         "Baseline" = "Baseline (Pearson r)", 
         "Followup" = "Followup  (Pearson r)") %>% 
  select("Response Variable", 
         "Baseline","Followup", 
         "#predictors/split",
         "#observations/split") %>%
  mutate(across(2:3, round, 3)) %>% 
  kableExtra::kbl(caption = paste0("Performance (Pearson r) and Hyperparameters of Random Forest Stacking for Factor Scores")) %>%
  kableExtra::kable_classic(full_width = F, html_font = "Cambria")

Performance (Pearson r) and Hyperparameters of Random Forest Stacking for Factor Scores
Response Variable	Baseline	Followup	#predictors/split	#observations/split
Language	0.423	0.418	4	160
Mental Flexibility	0.314	0.305	8	350
Memory Recall	0.369	0.348	4	200
G-Factor	0.434	0.414	4	170

CFA_rf_results[["rf_enet_baseline"]] %>%
  rename("Response Variable" = short_name,
         "NBack-fMRI" = Nback,
         "SST-fMRI" = SST,
         "MID-fMRI" = MID,
         "rs-fMRI" = rsmri,
         "sMRI" = smri,
         "Stacked" = random_forest_baseline) %>% 
  select("Response Variable", 
         "Stacked",
         "NBack-fMRI",
         "MID-fMRI",
         "SST-fMRI",
         "rs-fMRI",
         "sMRI",
         "DTI") %>%
  mutate(across(2:8, round, 3)) %>% 
  kableExtra::kbl(caption = paste0("Performance of Random Forest Stacking VS Elastic Net on Each Modality for Each Factor Score\n at the Baseline(Pearson r)")) %>%
  kableExtra::kable_classic(full_width = F, html_font = "Cambria")

Performance of Random Forest Stacking VS Elastic Net on Each Modality for Each Factor Score at the Baseline(Pearson r)
Response Variable	Stacked	NBack-fMRI	MID-fMRI	SST-fMRI	rs-fMRI	sMRI	DTI
Language	0.423	0.388	0.197	0.140	0.228	0.211	0.192
Mental Flexibility	0.314	0.313	0.144	0.063	0.133	0.132	0.144
Memory Recall	0.369	0.315	0.169	0.082	0.182	0.179	0.166
G-Factor	0.434	0.402	0.202	0.129	0.224	0.210	0.193

CFA_rf_results[["rf_enet_followup"]] %>%
   rename("Response Variable" = short_name,
         "NBack-fMRI" = Nback,
         "SST-fMRI" = SST,
         "MID-fMRI" = MID,
         "rs-fMRI" = rsmri,
         "sMRI" = smri,
         "Stacked" = random_forest_followup) %>% 
  select("Response Variable", 
         "Stacked",
         "NBack-fMRI",
         "MID-fMRI",
         "SST-fMRI",
         "rs-fMRI",
         "sMRI",
         "DTI") %>%
  mutate(across(2:8, round, 3)) %>% 
  kableExtra::kbl(caption = paste0("Performance of Random Forest Stacking VS Elastic Net on Each Modality for Each Factor Score\n at the Follow-Up (Pearson r)")) %>%
  kableExtra::kable_classic(full_width = F, html_font = "Cambria")

Performance of Random Forest Stacking VS Elastic Net on Each Modality for Each Factor Score at the Follow-Up (Pearson r)
Response Variable	Stacked	NBack-fMRI	MID-fMRI	SST-fMRI	rs-fMRI	sMRI	DTI
Language	0.418	0.370	0.149	0.150	0.254	0.236	0.211
Mental Flexibility	0.305	0.292	0.090	0.074	0.166	0.156	0.173
Memory Recall	0.348	0.319	0.123	0.130	0.206	0.189	0.188
G-Factor	0.414	0.383	0.148	0.145	0.253	0.230	0.217

CFA_Resp_Var%>% map(.,~rf_fit_CFA[[.]][["model_fit"]] %>%
  vip(geom = "point"))

## $Lang_first_order

## 
## $CogFlex_first_order

## 
## $MemRe_first_order

## 
## $g_second_order

3.3.4 Plot conditional permutated variable importance of the random forest stacking model for each factor score

enet_rf_valid_CFA_permu <- enet_rf_valid_CFA %>% map(.,function(data_input=.){
 data_output<- data_input[["NoID"]] %>% 
   dplyr::rename("MID_large"="MID_pred.x",
                 "SST_large"="SST_pred.x",
                 "smri_large"="smri_pred.x",
                 "rsmri_large"="rsmri_pred.x",
                 "DTI_large"="DTI_pred.x",
                 "Nback_large"="Nback_pred.x",
                 "MID_small"="MID_pred.y",
                 "SST_small"="SST_pred.y",
                 "smri_small"="smri_pred.y",
                 "rsmri_small"="rsmri_pred.y",
                 "DTI_small"="DTI_pred.y",
                 "Nback_small"="Nback_pred.y")
 return(data_output)
  })

cf_fit_CFA <- CFA_Resp_Var %>% map(.,function(resp_var=.){
  cforest(as.formula(paste0(resp_var, "~ .")),
          data=enet_rf_valid_CFA_permu[[resp_var]],
          control= cforest_unbiased(mtry=rf_fit_CFA[[resp_var]][["param_tune"]]$mtry,
                                    ntree=1000,
                                    minsplit=rf_fit_CFA[[resp_var]][["param_tune"]]$min_n))
}) 

PI_permimp_CFA <-cf_fit_CFA %>% 
  map(.,~permimp(.,progressBar = FALSE)) 

CFA_Resp_Var %>% 
  map(.,~plot(PI_permimp_CFA[[.]], type = "box", 
              horizontal = TRUE,
              main=paste0("Conditional Permutation Importance ",
                          CFA_var_plotting$short_name[[which(CFA_var_plotting$response==.)]])))

## $Lang_first_order
## NULL
## 
## $CogFlex_first_order
## NULL
## 
## $MemRe_first_order
## NULL
## 
## $g_second_order
## NULL

CFA_Resp_Var %>% 
  map(.,~plot(PI_permimp_CFA[[.]], 
              type = "bar", 
              interval="quantile",
              main=paste0("Conditional Permutation Importance ",
                          CFA_var_plotting$short_name[[which(CFA_var_plotting$response==.)]])))

## $Lang_first_order
## NULL
## 
## $CogFlex_first_order
## NULL
## 
## $MemRe_first_order
## NULL
## 
## $g_second_order
## NULL

3.3.5 Compute shapley values for the G-Factor

3.3.5.1 shapley computing function

library("fastshap")

## 
## Attaching package: 'fastshap'

## The following object is masked from 'package:vip':
## 
##     gen_friedman

## The following object is masked from 'package:dplyr':
## 
##     explain

##need to test this works for all the models
model_pred_fun <- function(object, newdata) {
  pred_results <- predict(object, new_data = newdata)
return(pred_results$.pred)
  }  


model_shapley <- function(fit_input,data_input,resp_input){
   
library(doFuture)
registerDoFuture()
plan(multisession(workers = 25))
## because of the false alarm in plyr running this chunk would give
doRNG::registerDoRNG()  

 model_shap <- fit_input %>% 
   fastshap::explain(X = data_input%>%
                       select(-resp_input)
                     %>% as.data.frame(),nsim= 1000, pred_wrapper =model_pred_fun
                     ,.parallel=TRUE
                     )
  
return(model_shap)
}

3.3.5.2 extract model fit and training data from the previous codes

gfactor_random_forest_train_data <- enet_rf_valid_CFA$g_second_order$NoID
  
gfactor_random_forest_train_ID <- enet_rf_valid_CFA$g_second_order$ID$SUBJECTKEY

gfactor_random_forest_fit <- rf_fit_CFA$g_second_order$model_fit

3.3.5.3 compute shapley values

shapley_gractor <- model_shapley(fit_input= gfactor_random_forest_fit, 
                                 data_input = gfactor_random_forest_train_data,
                                 resp_input = CFA_Resp_Var[4])

3.3.5.4 shapley summary plot function

# The input of this function should have exactly 4 variables
# SUBJECTKEY that is subject ID
# The names of modality ie MID, smri, rsmri, nback, sst and dti
# shapley values
# feature values
shapley_summary_plot <- function(data_input, list_val_input){
shap_plot <-   data_input%>%
 # filter(var_names %in% vars_keep)%>%
  mutate(modality_names = fct_reorder(modality_names, shapley_values,.fun = "max"))%>%
ggplot(aes(x = modality_names, y = shapley_values, color = feature_values)) +
    coord_flip(ylim = range(data_input$shapley_values)*1.1) +
    geom_hline(yintercept = 0) + # the y-axis beneath
    # sina plot:
    ggforce::geom_sina(method = "counts", maxwidth = 0.7, alpha = 0.7)+
    scale_color_gradient(low="blue", high="red",
                         breaks=unlist(list_val_input), labels=c('min','med','max'),
guide = guide_colorbar(direction = "horizontal",barwidth = 12, barheight = 0.3))+
    theme_bw() +
    theme(axis.line.y = element_blank(),
          axis.ticks.y = element_blank(), # remove axis line
          legend.position="bottom",
          legend.direction = 'horizontal',
          legend.title=element_text(size=20),
          legend.text=element_text(size=20),
          legend.box = "horizontal",
          axis.title.x= element_text(size = 20),
          axis.text.y = element_text(size = 20),
          axis.text.x = element_text(size = 20)) +
    labs(y = "SHAP value (impact on model output)", x = "", color = "Feature value  ")
return(shap_plot)
}





shapley_plot_list <- function(shapley_input, data_input, ID_input){
  ##processing shapley output
  shapley_x_wider <- shapley_input%>%tibble::as_tibble()%>%
    dplyr::select(ends_with(".x"))%>%
    rename(MID = MID_pred.x,rs_fMRI= rsmri_pred.x, SST= SST_pred.x, 
           sMRI= smri_pred.x, NBack = Nback_pred.x, DTI = DTI_pred.x)%>%
    mutate(SUBJECTKEY=ID_input)
 
  
  shapley_x<-  shapley_x_wider%>%
    pivot_longer(-SUBJECTKEY, names_to = "modality_names", values_to = "shapley_values")
  
  
  data_x <- data_input%>%tibble::as_tibble()%>%
    dplyr::select(ends_with(".x"))%>%
    rename(MID = MID_pred.x,rs_fMRI= rsmri_pred.x, SST= SST_pred.x, 
           sMRI= smri_pred.x, NBack = Nback_pred.x, DTI = DTI_pred.x)%>%
    mutate(SUBJECTKEY=ID_input)%>%
    pivot_longer(-SUBJECTKEY, names_to = "modality_names", values_to = "feature_values")
  
  
  shapley_y_wider <- shapley_input%>%tibble::as_tibble()%>%
    dplyr::select(ends_with(".y"))%>%
    rename(MID = MID_pred.y,rs_fMRI= rsmri_pred.y, SST= SST_pred.y, 
           sMRI= smri_pred.y, NBack = Nback_pred.y, DTI = DTI_pred.y)%>%
    mutate(SUBJECTKEY=ID_input)
  shapley_y <-  shapley_y_wider%>%
    pivot_longer(-SUBJECTKEY, names_to = "modality_names", values_to = "shapley_values")
  
  
  data_y <- data_input%>%tibble::as_tibble()%>%
    dplyr::select(ends_with(".y"))%>%
    rename(MID = MID_pred.y,rs_fMRI= rsmri_pred.y, SST= SST_pred.y, 
           sMRI= smri_pred.y, NBack = Nback_pred.y, DTI = DTI_pred.y)%>%
    mutate(SUBJECTKEY=ID_input)%>%
    pivot_longer(-SUBJECTKEY, names_to = "modality_names", values_to = "feature_values")
  
  shapley_all <- bind_rows(shapley_x,shapley_y)
  data_all <- bind_rows(data_x,data_y)
  shapley_summary_tibble <- plyr::join_all(list(shapley_all,data_all), 
                               by=c("SUBJECTKEY","modality_names"),type="full")
  
  

    ##shapley summary plot
    shapley_summary_tibble_NA <- shapley_summary_tibble %>%
                          naniar::replace_with_na(replace = list(feature_values = c(-1000,1000.000000000)))
   shapley_summary_tibble_0 <- shapley_summary_tibble
      shapley_summary_tibble_0$feature_values[which(shapley_summary_tibble_0$feature_values == 1000)] <- 0
      shapley_summary_tibble_0$feature_values[which(shapley_summary_tibble_0$feature_values == -1000)] <- 0

    
   # list of functions to calculate the values where you want your breaks
    myfuns <- list(min, median, max)
    # use this list to make a list of your breaks
    list_val <- lapply(myfuns, function(f) f(shapley_summary_tibble$feature_values)) 
    list_val_0 <- lapply(myfuns, function(f) f(shapley_summary_tibble_0$feature_values))
    
  shapley_summary_plot_1k <- shapley_summary_plot(data_input = shapley_summary_tibble,
                                               list_val_input = list_val)  
  
    shapley_summary_plot_na <- shapley_summary_plot(data_input = shapley_summary_tibble_NA,
                                                    list_val_input = list_val_0)  

  
shapley_vi <- bind_rows(shapley_x_wider,shapley_y_wider)%>%
              select(-"SUBJECTKEY") %>%
              abs()%>% colSums()
              
shapley_vi_tibble <- tibble(modality_names = names(shapley_vi),
                            shapley_vi = shapley_vi)  

shapley_vi_plot <- shapley_vi_tibble %>%  mutate(modality_names = fct_reorder(modality_names, shapley_vi,.fun = "max"))%>%
  ggplot(aes(x = modality_names, y = shapley_vi))+
                      geom_bar(stat = "identity")+
                      coord_flip()+
  labs(y = "|SHAP| variable importance", x = "") +
    theme(axis.title.x= element_text(size = 20),
          axis.title.y= element_text(size = 20),
          axis.text.y = element_text(size = 20),
          axis.text.x = element_text(size = 20))
                      

return(list(summary_plot_na = shapley_summary_plot_na,
            summary_plot_1k = shapley_summary_plot_1k, 
            importance_plot = shapley_vi_plot))
}

shapley_plot_list(shapley_input = shapley_gractor,
                      data_input = gfactor_random_forest_train_data,
                      ID_input = gfactor_random_forest_train_ID)

## Registered S3 method overwritten by 'ggforce':
##   method           from 
##   scale_type.units units

## $summary_plot_na

## 
## $summary_plot_1k

## 
## $importance_plot

3.3.6 Plot obverved against predicted value for the Random Forest Stacking model for each factor score

random_forest_plotting_followup(rf_fit =rf_fit_CFA,
                                followup_data=enet_rf_test_followup_CFA, 
                                naming_frame = CFA_var_plotting, 
                                resp_vec= CFA_Resp_Var)

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

random_forest_plotting_baseline(rf_fit =rf_fit_CFA,
                                baseline_data=enet_rf_test_baseline_CFA, 
                                naming_frame =  CFA_var_plotting, 
                                resp_vec= CFA_Resp_Var)

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

3.3.7 Plot obverved against predicted value for the g factor across modalities at the Baseline

modality_ploting <- c("Stacked_All","Stacked_Complete","Stacked_Best", "Stacked_NoBest", "NBack","MID", "SST","rs_fMRI","sMRI","DTI")
resp_plotting <- c("g_second_order")

predictedObserved_GfactBaseline <-
  baseline_CFA %>% select(SUBJECTKEY, g_second_order) %>% 
  # enet_rf_test_baseline_CFA$g_second_order$ID %>% #observed:
  # select(SUBJECTKEY, g_second_order) %>% 
  full_join(rf_fit_CFA$g_second_order$predicted_baseline, by = "SUBJECTKEY") %>%
  rename("Stacked_All"= pred_baseline) %>%
  full_join(enet_CFA$Nback_results$g_second_order$predicted_baseline, by = "SUBJECTKEY") %>%
  rename("NBack" = pred) %>%
  full_join(enet_CFA$SST_results$g_second_order$predicted_baseline, by = "SUBJECTKEY") %>%
  rename("SST" = pred) %>%
  full_join(enet_CFA$MID_results$g_second_order$predicted_baseline, by = "SUBJECTKEY") %>%
  rename("MID" = pred) %>%
  full_join(enet_CFA$rsmri_results$g_second_order$predicted_baseline, by = "SUBJECTKEY") %>%
  rename("rs_fMRI" = pred) %>%
  full_join(enet_CFA$smri_results$g_second_order$predicted_baseline, by = "SUBJECTKEY") %>%
  rename("sMRI" = pred) %>%
  full_join(enet_CFA$DTI_results$g_second_order$predicted_baseline, by = "SUBJECTKEY") %>%
  rename("DTI" = pred) 

#create a column for list wise deletion and apply z score
predictedObserved_GfactBaselineZ <- predictedObserved_GfactBaseline %>% 
  na.omit() %>% 
  select(SUBJECTKEY, "Stacked_All") %>%
  rename("Stacked_Complete" = "Stacked_All") %>% 
  right_join(predictedObserved_GfactBaseline, by = "SUBJECTKEY") 
#%>%
# mutate_at(resp_plotting, ~(scale(.) %>% as.vector))

#create 1) stacked bast = a stacked column with the same subjects with nback
#create 2) stacked others = a stacked column without the nback subjects 
predictedObserved_GfactBaselineZ <-  predictedObserved_GfactBaselineZ %>% 
  select(SUBJECTKEY, "Stacked_All", "NBack") %>%
  drop_na("NBack") %>%
  rename("Stacked_Best" = "Stacked_All") %>% select(-NBack) %>% 
  right_join(predictedObserved_GfactBaselineZ, by = "SUBJECTKEY") %>% 
  mutate(Stacked_NoBest = Stacked_All)

predictedObserved_GfactBaselineZ[which(!is.na(predictedObserved_GfactBaselineZ$NBack)), 
                                       "Stacked_NoBest"] <- NA

predictedObserved_GfactBaselineZ <- 
  predictedObserved_GfactBaselineZ[, c(1, 4, 5, 3, 2, 12, 6, 7, 8, 9, 10, 11)]

plotGFact_baseline <- list()
for (modalityNum in seq(1,length(modality_ploting),1)) 
     {
      plotGFact_baseline[[modalityNum]]  <-
        ggplot(data = predictedObserved_GfactBaselineZ,
               aes(x = scale(.data[[modality_ploting[modalityNum]]]), 
                          y = scale(.data[['g_second_order']]))) +
          geom_jitter(height = 0.1, width = 0.1, size = 1, col = 'grey60') +
          geom_smooth(method = 'lm', se = FALSE, col = 'black')  +
          labs(x = NULL,
               y = NULL,
               title = paste0(modality_ploting[modalityNum],
               ' r=',round(cor(predictedObserved_GfactBaselineZ[modality_ploting[modalityNum]],
                                  predictedObserved_GfactBaselineZ$g_second_order,
                                  use = "pairwise.complete.obs"), 3))) +
               # "\nMAE=", round(
               #      yardstick::mae_vec(truth = as.numeric(unlist(predictedObserved_GfactBaselineZ[modality_ploting[modalityNum]])),
               #                    predictedObserved_GfactBaselineZ$g_second_order,
               #                    na_rm = TRUE),3)))  +
        theme(text = element_text(size=16))
    }
figure <- 
  ggpubr::ggarrange(ggpubr::ggarrange(plotlist=plotGFact_baseline[1:4]),
                  ggpubr::ggarrange(plotlist=plotGFact_baseline[5:10]),
                  nrow = 2)

  ggpubr::annotate_figure(figure,
                top = ggpubr::text_grob("Ability to Predict G-Factor at Baseline", face = "bold", size = 29),
                bottom = ggpubr::text_grob("Predicted G-Factor (Z)", size = 30),
                left = ggpubr::text_grob("Observed G-Factor (Z)", size = 30, rot = 90)
                )

3.3.8 number of (non-)missing values

naniar::vis_miss(predictedObserved_GfactBaselineZ %>% select(-SUBJECTKEY,-g_second_order)) + 
  guides(fill = ggplot2::guide_legend(reverse = TRUE)) +
  theme(
    axis.text.x=element_text(angle =- 90,hjust = 1),
        text = element_text(size=18))

## Warning: `gather_()` was deprecated in tidyr 1.2.0.
## Please use `gather()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.

colSums(!is.na(predictedObserved_GfactBaselineZ))

##       SUBJECTKEY   g_second_order      Stacked_All Stacked_Complete 
##             5487             5487             5171             1164 
##     Stacked_Best   Stacked_NoBest            NBack              SST 
##             2653             2518             2653             2791 
##              MID          rs_fMRI             sMRI              DTI 
##             3138             3839             4567             4071

3.3.9 Plot obverved against predicted value for the g factor across modalities at the Follow-Up

predictedObserved_Gfactfollowup <-
  followup_CFA %>% select(SUBJECTKEY, g_second_order) %>% 
  # enet_rf_test_baseline_CFA$g_second_order$ID %>% #observed:
  # select(SUBJECTKEY, g_second_order) %>% 
  full_join(rf_fit_CFA$g_second_order$predicted_followup, by = "SUBJECTKEY") %>%
  rename("Stacked_All"= pred_followup) %>%
  full_join(enet_CFA$Nback_results$g_second_order$predicted_followup, by = "SUBJECTKEY") %>%
  rename("NBack" = pred) %>%
  full_join(enet_CFA$SST_results$g_second_order$predicted_followup, by = "SUBJECTKEY") %>%
  rename("SST" = pred) %>%
  full_join(enet_CFA$MID_results$g_second_order$predicted_followup, by = "SUBJECTKEY") %>%
  rename("MID" = pred) %>%
  full_join(enet_CFA$rsmri_results$g_second_order$predicted_followup, by = "SUBJECTKEY") %>%
  rename("rs_fMRI" = pred) %>%
  full_join(enet_CFA$smri_results$g_second_order$predicted_followup, by = "SUBJECTKEY") %>%
  rename("sMRI" = pred) %>%
  full_join(enet_CFA$DTI_results$g_second_order$predicted_followup, by = "SUBJECTKEY") %>%
  rename("DTI" = pred) 

#create a column for list wise deletion and apply z score
predictedObserved_GfactfollowupZ <- predictedObserved_Gfactfollowup %>% 
  na.omit() %>% 
  select(SUBJECTKEY, "Stacked_All") %>%
  rename("Stacked_Complete" = "Stacked_All") %>% 
  right_join(predictedObserved_Gfactfollowup, by = "SUBJECTKEY") 
#%>%
 # mutate_at(resp_plotting, ~(scale(.) %>% as.vector))

#create 1) stacked bast = a stacked column with the same subjects with nback
#create 2) stacked others = a stacked column without the nback subjects 
predictedObserved_GfactfollowupZ <- predictedObserved_GfactfollowupZ %>% 
  select(SUBJECTKEY, "Stacked_All", "NBack") %>%
  drop_na("NBack") %>%
  rename("Stacked_Best" = "Stacked_All") %>% select(-NBack) %>% 
  right_join(predictedObserved_GfactfollowupZ, by = "SUBJECTKEY") %>% 
  mutate(Stacked_NoBest = Stacked_All)

predictedObserved_GfactfollowupZ[which(!is.na(predictedObserved_GfactfollowupZ$NBack)), 
                                       "Stacked_NoBest"] <- NA

predictedObserved_GfactfollowupZ <- 
  predictedObserved_GfactfollowupZ[, c(1, 4, 5, 3, 2, 12, 6, 7, 8, 9, 10, 11)]

plotGFact_followup <- list()
for (modalityNum in seq(1,length(modality_ploting),1)) 
     {
      plotGFact_followup[[modalityNum]]  <-
        ggplot(data = predictedObserved_GfactfollowupZ,
               aes(x = scale(.data[[modality_ploting[modalityNum]]]) , 
                          y =scale(.data[['g_second_order']]) )) +
          geom_jitter(height = 0.1, width = 0.1, size = 1, col = 'grey60') +
          geom_smooth(method = 'lm', se = FALSE, col = 'black')  +
          labs(x = NULL,
               y = NULL,
               title = paste0(modality_ploting[modalityNum],
                ' r=',round(cor(predictedObserved_GfactfollowupZ[modality_ploting[modalityNum]],
                                   predictedObserved_GfactfollowupZ$g_second_order,
                                   use = "pairwise.complete.obs"), 3)))+
               # "\nMAE=", round(
               #      yardstick::mae_vec(truth = as.numeric(unlist(predictedObserved_GfactfollowupZ[modality_ploting[modalityNum]])),
               #                    predictedObserved_GfactfollowupZ$g_second_order,
               #                    na_rm = TRUE),3)))  +
        theme(text = element_text(size=16))
    }
figure <- 
  ggpubr::ggarrange(ggpubr::ggarrange(plotlist=plotGFact_followup[1:4]),
                  ggpubr::ggarrange(plotlist=plotGFact_followup[5:10]),
                  nrow = 2)
  ggpubr::annotate_figure(figure,
                top = ggpubr::text_grob("Ability to Predict G-Factor at Follow-up", face = "bold", size = 29),
                bottom = ggpubr::text_grob("Predicted G-Factor (Z)", size = 30),
                left = ggpubr::text_grob("Observed G-Factor (Z)", size = 30, rot = 90)
                )

3.3.10 number of (non-)missing values

naniar::vis_miss(predictedObserved_GfactfollowupZ %>% select(-SUBJECTKEY,-g_second_order)) + 
  guides(fill = ggplot2::guide_legend(reverse = TRUE)) +
  theme(
    axis.text.x=element_text(angle =- 90,hjust = 1),
        text = element_text(size=20))

colSums(!is.na(predictedObserved_GfactfollowupZ))

##       SUBJECTKEY   g_second_order      Stacked_All Stacked_Complete 
##             5280             5280             5119             1374 
##     Stacked_Best   Stacked_NoBest            NBack              SST 
##             2996             2123             2996             2903 
##              MID          rs_fMRI             sMRI              DTI 
##             3127             4137             4569             4230

3.4 differences in g-factor between missing vs not missing

3.4.1 create data frame for mssing values

  predictedObserved_GfactBaselineZ_missing <-
  predictedObserved_GfactBaselineZ %>%
    mutate(Stacked_All_missing = is.na(Stacked_All)) %>%
    mutate(Stacked_Complete_missing = is.na(Stacked_Complete)) %>%
    mutate(Stacked_Best_missing = is.na(Stacked_Best)) %>%
    mutate(Stacked_NoBest_missing = is.na(Stacked_NoBest)) %>% 
    mutate(NBack_missing = is.na(NBack)) %>%
    mutate(SST_missing = is.na(SST)) %>%
    mutate(MID_missing = is.na(MID)) %>%
    mutate(rs_fMRI_missing = is.na(rs_fMRI)) %>%
    mutate(sMRI_missing = is.na(sMRI)) %>%
    mutate(DTI_missing = is.na(DTI)) 

  predictedObserved_GfactfollowupZ_missing <-
  predictedObserved_GfactfollowupZ %>%
    mutate(Stacked_All_missing = is.na(Stacked_All)) %>%
    mutate(Stacked_Complete_missing = is.na(Stacked_Complete)) %>%
    mutate(Stacked_Best_missing = is.na(Stacked_Best)) %>%
    mutate(Stacked_NoBest_missing = is.na(Stacked_NoBest)) %>% 
    mutate(NBack_missing = is.na(NBack)) %>%
    mutate(SST_missing = is.na(SST)) %>%
    mutate(MID_missing = is.na(MID)) %>%
    mutate(rs_fMRI_missing = is.na(rs_fMRI)) %>%
    mutate(sMRI_missing = is.na(sMRI)) %>%
    mutate(DTI_missing = is.na(DTI))

3.4.2 function for plotting the differences with cohen D

ttest_missing_plot <- function(missing_colume_name, missing_title, missing_response, input_df){

  ttest_formula <- as.formula(paste(missing_response, paste(missing_colume_name, collapse=" + "), sep=" ~ "))
  
  ttest_missing_gg <- input_df %>%
    ggplot(aes_string(y=missing_response, x= missing_colume_name))+
    geom_violin(trim = FALSE) + 
    stat_summary(
      fun.data = "mean_sdl",  fun.args = list(mult = 2), 
      geom = "pointrange", color = "black"
      ) + 
    scale_x_discrete(labels=c("Present", "Missing")) + 
    ggpubr::stat_pvalue_manual(
      predictedObserved_GfactBaselineZ_missing %>%
        rstatix::t_test(ttest_formula) %>%
        rstatix::add_significance() %>% 
        rstatix::add_xy_position(x = missing_colume_name)) + 
    ggtitle(paste0(missing_title, "\nCohen d = ", 
          rstatix::cohens_d(data = input_df, 
                  formula = ttest_formula, 
                  var.equal = FALSE)%>% .$effsize %>% round(digits =3)))+
    theme(axis.title.x=element_blank()) + 
    theme(axis.title.y=element_blank())  
  
  return(ttest_missing_gg)
}

3.4.3 plot the differences with cohen D for baseline and follow up

missing_vars <- predictedObserved_GfactBaselineZ_missing %>% 
  select(ends_with("missing")) %>% 
  colnames() %>% 
  as.list() 

missing_names <- predictedObserved_GfactBaselineZ_missing %>% 
  select(ends_with("missing")) %>% 
  colnames() %>% 
  str_replace("_missing", "") %>%
  as.list() 

missing_response <- replicate(length(missing_names), "g_second_order") %>% as.list() 

missing_input_baseline_df <- replicate(length(missing_names), predictedObserved_GfactBaselineZ_missing, simplify = FALSE)
missing_input_followUp_df <- replicate(length(missing_names), predictedObserved_GfactfollowupZ_missing, simplify = FALSE)

ttest_missing_plot_baseline_list <- purrr::pmap(list(missing_colume_name = missing_vars,
                 missing_title = missing_names,
                 missing_response = missing_response,
                 input_df = missing_input_baseline_df),            
            ttest_missing_plot)

title_baseline.grob <- textGrob("Baseline Test", 
                   gp=gpar(fontsize=18))

ttest_missing_plot_baseline_gg <- 
  cowplot::plot_grid(plotlist=ttest_missing_plot_baseline_list,
                  nrow =5, align='vh', vjust=1, scale = 1) 

ttest_missing_plot_baseline_gg_title <- gridExtra::grid.arrange(gridExtra::arrangeGrob(ttest_missing_plot_baseline_gg, 
                                                  top = title_baseline.grob))

ttest_missing_plot_followup_list <- purrr::pmap(list(missing_colume_name = missing_vars,
                 missing_title = missing_names,
                 missing_response = missing_response,
                 input_df = missing_input_followUp_df),            
            ttest_missing_plot)

title_followup.grob <- textGrob("Follow Up Test", 
                   gp=gpar(fontsize=18))

ttest_missing_plot_followup_gg <- 
  cowplot::plot_grid(plotlist=ttest_missing_plot_followup_list,
                  nrow =5, align='vh', vjust=1, scale = 1) 

ttest_missing_plot_followup_gg_title <- gridExtra::grid.arrange(gridExtra::arrangeGrob(ttest_missing_plot_followup_gg, 
                                                  top = title_followup.grob))

3.4.4 combine the plots for the differences with cohen D for baseline and follow up

ttest_missing_plot_both_gg <- cowplot::plot_grid(ttest_missing_plot_baseline_gg_title, 
                   ttest_missing_plot_followup_gg_title,
                  nrow =1)

title_overall.grob <- textGrob(expression(paste("Differences in ",italic("g"), "-Factor(Z) As a Function of Missing Data")), 
                   gp=gpar(fontsize=20))

y_overall.grob <- textGrob(expression(paste(italic("g"), "-factor(Z)")), 
                   gp=gpar(fontsize=18), rot = 90)

ttest_missing_plot_both_gg_title <- gridExtra::grid.arrange(gridExtra::arrangeGrob(ttest_missing_plot_both_gg, 
                                                  top = title_overall.grob,
                                                  left = y_overall.grob))

3.4.5 Creating tables fore predictive performance

get_summary_stats <- function(data_input, target_input){
 data_input <- data_input %>% 
    mutate(pred_target = as.numeric(scale(data_input$g_second_order)) ) 
 
    mae_perform <- yardstick::mae(data =data_input, 
                                truth=pred_target, estimate=data_input[[target_input]])
    rmse_perform <- yardstick::rmse(data =data_input, 
                                    truth=pred_target, estimate=data_input[[target_input]])
    cor_perform <- cor(data_input$g_second_order,
                 data_input[[target_input]],
                 use = "pairwise.complete.obs")
    tradrsq_perform <- yardstick::rsq_trad(data = data_input, 
                                         truth=pred_target, 
                                         estimate=data_input[[target_input]])
return( tibble(Models=target_input, 
               Pearson_r=round(cor_perform,3),
               R_squared=round(tradrsq_perform$.estimate,3),
               MAE=round(mae_perform$.estimate,3),
               RMSE=round(rmse_perform$.estimate,3) 
               ) )
}

elem_remove <- c("SUBJECTKEY","g_second_order")
baseline_pred_vec <- setdiff(colnames(predictedObserved_GfactBaselineZ),elem_remove) 

baseline_perform <- baseline_pred_vec %>% 
  map(.,~get_summary_stats(data_input = predictedObserved_GfactBaselineZ,
                           target_input = .))%>% 
  do.call(rbind,.)%>%
  kableExtra::kbl(caption = paste0("Predictive Performance for the baseline test set")) %>%
   kableExtra::kable_classic(full_width = F, html_font = "Cambria") %>%
   print()

Predictive Performance for the baseline test set
Models	Pearson_r	R_squared	MAE	RMSE
Stacked_All	0.439	0.191	0.699	0.895
Stacked_Complete	0.429	0.183	0.610	0.780
Stacked_Best	0.442	0.195	0.620	0.798
Stacked_NoBest	0.296	0.085	0.783	0.987
NBack	0.402	0.072	0.664	0.857
SST	0.129	-0.033	0.744	0.950
MID	0.202	0.013	0.738	0.944
rs_fMRI	0.233	0.042	0.749	0.955
sMRI	0.208	0.040	0.763	0.969
DTI	0.190	0.033	0.757	0.972

followup_pred_vec <- setdiff(colnames(predictedObserved_GfactfollowupZ)
                             ,elem_remove) 

followup_perform <- followup_pred_vec %>% 
  map(.,~get_summary_stats(data_input = predictedObserved_GfactfollowupZ,
                           target_input = .))%>% 
  do.call(rbind,.)%>%
  kableExtra::kbl(caption = paste0("Predictive Performance for the followup test set")) %>%
   kableExtra::kable_classic(full_width = F, html_font = "Cambria") %>%
   print()

Predictive Performance for the followup test set
Models	Pearson_r	R_squared	MAE	RMSE
Stacked_All	0.414	0.166	0.719	0.913
Stacked_Complete	0.427	0.168	0.651	0.829
Stacked_Best	0.438	0.175	0.666	0.846
Stacked_NoBest	0.317	0.100	0.794	1.000
NBack	0.383	0.118	0.687	0.875
SST	0.145	-0.004	0.760	0.961
MID	0.148	-0.003	0.757	0.955
rs_fMRI	0.251	0.055	0.754	0.954
sMRI	0.226	0.049	0.764	0.965
DTI	0.212	0.045	0.771	0.978

3.4.6 function to bootstrap the differences between Stack Best vs N Back

perfDiff = function(data, indices){
  sample = data[indices,]
  
  cor_Stack <- cor(sample$g_second_order,
                 sample$Stacked_Best,
                 use = "pairwise.complete.obs")
  
  cor_NBack <- cor(sample$g_second_order,
                 sample$NBack,
                 use = "pairwise.complete.obs")
  
  tradrsq_Stack <- yardstick::rsq_trad(data =sample, 
                              truth=g_second_order, 
                              estimate=Stacked_Best)

  tradrsq_NBack <- yardstick::rsq_trad(data =sample, 
                              truth=g_second_order, 
                              estimate=NBack)
  
  mae_Stack <- yardstick::mae(data =sample, 
                              truth=g_second_order, 
                              estimate=Stacked_Best)
  
  mae_NBack <- yardstick::mae(data =sample, 
                              truth=g_second_order, 
                              estimate=NBack)
  
  rmse_Stack <- yardstick::rmse(data =sample, 
                              truth=g_second_order, 
                              estimate=Stacked_Best)
  
  rmse_NBack <- yardstick::rmse(data =sample, 
                              truth=g_second_order, 
                              estimate=NBack)
 
  cor_StackVSNBack = cor_Stack - cor_NBack 
  tradrsq_StackVSNBack = tradrsq_Stack$.estimate - tradrsq_NBack$.estimate
  mae_StackVSNBack = mae_Stack$.estimate - mae_NBack$.estimate
  rmse_StackVSNBack = rmse_Stack$.estimate - rmse_NBack$.estimate
  
  StackVSNBack = c(cor_StackVSNBack, tradrsq_StackVSNBack, mae_StackVSNBack, rmse_StackVSNBack)
  
  return(StackVSNBack)
}

3.4.7 actualy bootstraping for the baseline

StackBestVSBaseline <- predictedObserved_GfactBaselineZ %>% 
  select(g_second_order,Stacked_Best,NBack) %>% na.omit()

set.seed(12345)
DiffResultsBaseline <- boot::boot(data = StackBestVSBaseline,
           statistic = perfDiff,
           R = 5000)

#"Bias Corrected and Accelerated"
ci_r_baseline <- boot::boot.ci(DiffResultsBaseline, index = 1, conf = 0.95, type = 'bca')
ci_rsq_baseline <-boot::boot.ci(DiffResultsBaseline, index = 2, conf = 0.95, type = 'bca')
ci_mae_baseline <- boot::boot.ci(DiffResultsBaseline, index = 3, conf = 0.95, type = 'bca')
ci_rmse_baseline <- boot::boot.ci(DiffResultsBaseline, index = 4, conf = 0.95, type = 'bca')

ci_r_baseline

## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 5000 bootstrap replicates
## 
## CALL : 
## boot::boot.ci(boot.out = DiffResultsBaseline, conf = 0.95, type = "bca", 
##     index = 1)
## 
## Intervals : 
## Level       BCa          
## 95%   ( 0.0206,  0.0585 )  
## Calculations and Intervals on Original Scale

ci_rsq_baseline

## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 5000 bootstrap replicates
## 
## CALL : 
## boot::boot.ci(boot.out = DiffResultsBaseline, conf = 0.95, type = "bca", 
##     index = 2)
## 
## Intervals : 
## Level       BCa          
## 95%   ( 0.0727,  0.1393 )  
## Calculations and Intervals on Original Scale

ci_mae_baseline

## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 5000 bootstrap replicates
## 
## CALL : 
## boot::boot.ci(boot.out = DiffResultsBaseline, conf = 0.95, type = "bca", 
##     index = 3)
## 
## Intervals : 
## Level       BCa          
## 95%   (-0.0396, -0.0157 )  
## Calculations and Intervals on Original Scale

ci_rmse_baseline

## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 5000 bootstrap replicates
## 
## CALL : 
## boot::boot.ci(boot.out = DiffResultsBaseline, conf = 0.95, type = "bca", 
##     index = 4)
## 
## Intervals : 
## Level       BCa          
## 95%   (-0.0516, -0.0270 )  
## Calculations and Intervals on Original Scale

StackBestVSfollowup <- predictedObserved_GfactfollowupZ %>% 
  select(g_second_order,Stacked_Best,NBack) %>% na.omit()

3.4.8 actualy bootstraping for the follow up

set.seed(12345)
DiffResultsfollowup <- boot::boot(data = StackBestVSfollowup,
                                  statistic = perfDiff,
                                  R = 5000)

#"Bias Corrected and Accelerated"
ci_r_followup <- boot::boot.ci(DiffResultsfollowup, index = 1, conf = 0.95, type = 'bca')
ci_rsq_followup <-boot::boot.ci(DiffResultsfollowup, index = 2, conf = 0.95, type = 'bca')
ci_mae_followup <- boot::boot.ci(DiffResultsfollowup, index = 3, conf = 0.95, type = 'bca')
ci_rmse_followup <- boot::boot.ci(DiffResultsfollowup, index = 4, conf = 0.95, type = 'bca')

ci_r_followup

## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 5000 bootstrap replicates
## 
## CALL : 
## boot::boot.ci(boot.out = DiffResultsfollowup, conf = 0.95, type = "bca", 
##     index = 1)
## 
## Intervals : 
## Level       BCa          
## 95%   ( 0.0359,  0.0732 )  
## Calculations and Intervals on Original Scale

ci_rsq_followup

## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 5000 bootstrap replicates
## 
## CALL : 
## boot::boot.ci(boot.out = DiffResultsfollowup, conf = 0.95, type = "bca", 
##     index = 2)
## 
## Intervals : 
## Level       BCa          
## 95%   ( 0.0126,  0.0745 )  
## Calculations and Intervals on Original Scale

ci_mae_followup

## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 5000 bootstrap replicates
## 
## CALL : 
## boot::boot.ci(boot.out = DiffResultsfollowup, conf = 0.95, type = "bca", 
##     index = 3)
## 
## Intervals : 
## Level       BCa          
## 95%   (-0.0211,  0.0020 )  
## Calculations and Intervals on Original Scale

ci_rmse_followup

## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 5000 bootstrap replicates
## 
## CALL : 
## boot::boot.ci(boot.out = DiffResultsfollowup, conf = 0.95, type = "bca", 
##     index = 4)
## 
## Intervals : 
## Level       BCa          
## 95%   (-0.0294, -0.0050 )  
## Calculations and Intervals on Original Scale

3.4.9 plot bootstrapped distribution of Stacked Best > N-Back in the Baseline Test Set

r_plot_baseline <- as_tibble(DiffResultsBaseline$t[,1]) %>%
  ggplot() + geom_density(aes(x = value)) + 
  ggtitle("Pearson's r") +
  geom_vline(aes(xintercept=ci_r_baseline$bca[4]), color="red", linetype="dashed") +
  geom_vline(aes(xintercept=ci_r_baseline$bca[5]), color="red", linetype="dashed") +
    theme(axis.title.x=element_blank(),
        axis.title.y=element_blank(),
        text = element_text(size=20))

rsq_plot_baseline <- as_tibble(DiffResultsBaseline$t[,2]) %>%
  ggplot() + geom_density(aes(x = value)) +
  ggtitle("R squared") +
  geom_vline(aes(xintercept=ci_rsq_baseline$bca[4]), color="red", linetype="dashed") +
  geom_vline(aes(xintercept=ci_rsq_baseline$bca[5]), color="red", linetype="dashed") +
    theme(axis.title.x=element_blank(),
        axis.title.y=element_blank(),
        text = element_text(size=20))

mae_plot_baseline <- as_tibble(DiffResultsBaseline$t[,3]) %>%
  ggplot() + geom_density(aes(x = value)) + 
  ggtitle("MAE") +
  geom_vline(aes(xintercept=ci_mae_baseline$bca[4]), color="red", linetype="dashed") +
  geom_vline(aes(xintercept=ci_mae_baseline$bca[5]), color="red", linetype="dashed") +
    theme(axis.title.x=element_blank(),
        axis.title.y=element_blank(),
        text = element_text(size=20))

rmse_plot_baseline <- as_tibble(DiffResultsBaseline$t[,4]) %>%
  ggplot() + geom_density(aes(x = value)) + 
  ggtitle("RMSE") +
  geom_vline(aes(xintercept=ci_rmse_baseline$bca[4]), color="red", linetype="dashed") +
  geom_vline(aes(xintercept=ci_rmse_baseline$bca[5]), color="red", linetype="dashed") +
    theme(axis.title.x=element_blank(),
        axis.title.y=element_blank(),
        text = element_text(size=20))

boot_baseline <- 
  ggpubr::ggarrange(plotlist= 
                    list(r_plot_baseline, 
                         rsq_plot_baseline, 
                         mae_plot_baseline, 
                         rmse_plot_baseline)) %>% 
  ggpubr::annotate_figure(top = ggpubr::text_grob("Baseline Test Set",
                                                     size=22,
                                                     face = "bold"))

3.4.10 plot bootstrapped distribution of Stacked Best > N-Back in the followup Test Set

r_plot_followup <- as_tibble(DiffResultsfollowup$t[,1]) %>%
  ggplot() + geom_density(aes(x = value)) + 
  ggtitle("Pearson's r") +
  geom_vline(aes(xintercept=ci_r_followup$bca[4]), color="red", linetype="dashed") +
  geom_vline(aes(xintercept=ci_r_followup$bca[5]), color="red", linetype="dashed") +
  theme(axis.title.x=element_blank(),
        axis.title.y=element_blank(),
        text = element_text(size=20))

rsq_plot_followup <- as_tibble(DiffResultsfollowup$t[,2]) %>%
  ggplot() + geom_density(aes(x = value)) +
  ggtitle("R squared") +
  geom_vline(aes(xintercept=ci_rsq_followup$bca[4]), color="red", linetype="dashed") +
  geom_vline(aes(xintercept=ci_rsq_followup$bca[5]), color="red", linetype="dashed") +
  theme(axis.title.x=element_blank(),
        axis.title.y=element_blank(),
        text = element_text(size=20))

mae_plot_followup <- as_tibble(DiffResultsfollowup$t[,3]) %>%
  ggplot() + geom_density(aes(x = value)) + 
  ggtitle("MAE") +
  geom_vline(aes(xintercept=ci_mae_followup$bca[4]), color="red", linetype="dashed") +
  geom_vline(aes(xintercept=ci_mae_followup$bca[5]), color="red", linetype="dashed") +
  theme(axis.title.x=element_blank(),
        axis.title.y=element_blank(),
        text = element_text(size=20))

rmse_plot_followup <- as_tibble(DiffResultsfollowup$t[,4]) %>%
  ggplot() + geom_density(aes(x = value)) + 
  ggtitle("RMSE") +
  geom_vline(aes(xintercept=ci_rmse_followup$bca[4]), color="red", linetype="dashed") +
  geom_vline(aes(xintercept=ci_rmse_followup$bca[5]), color="red", linetype="dashed") +
  theme(axis.title.x=element_blank(),
        axis.title.y=element_blank(),
        text = element_text(size=20))

boot_followup <- ggpubr::ggarrange(plotlist= 
                                     list(r_plot_followup, rsq_plot_followup, mae_plot_followup, rmse_plot_followup)) %>% 
  ggpubr::annotate_figure(top = ggpubr::text_grob("Follow-up Test Set",
                                                     size=22,
                                                     face = "bold"))

3.4.11 combine bootstrapped distribution plots

ggpubr::ggarrange(boot_baseline, boot_followup,
                  nrow=2) %>%
  ggpubr::annotate_figure(top = 
                            ggpubr::text_grob("Bootstrapped Distribution Stacked Best > N-Back",
                                                     size=24,
                                                     face = "bold"))

4 Leave one site out cross validation

To test if the model generalise across sites, we used the baseline data, fit the model to all but one site and test the results at hold-out site.

4.1 site up the folds

After leaving one site as a test, we 50/50 divided the rest into train (i.e,. 1st layer) and validate (i.e., 2nd layer). This process then was repeated until we finished all of held-out sites.

baseline_site <- baseline_analysis %>% drop_na(SITE_ID_L) 
unique_site <- length(unique(baseline_site$SITE_ID_L))

site_vfold <- group_vfold_cv(data = baseline_site, group = "SITE_ID_L", v= unique_site)
## the following two lines take the split results and transform it into a data frame
## so that all the results can be checked.
#model_data_1 <- site_vfold$splits[[1]]%>% analysis()
#holdout__data_1 <- site_vfold$splits[[1]] %>% assessment()

all_sample <- site_vfold$id

all_site <-  all_sample %>% 
  map(.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L)) %>% 
  do.call(rbind,.)

all_site <- as.vector(all_site)
all_site_number <- str_remove_all(all_site,"site")

###divide the sampled data into train valid and test sets
#after leave one site, the rest was devided into 2: train and validate

 train_valid_split <- all_sample %>% 
   map(.,~vfold_cv(data=analysis(site_vfold$splits[[which(site_vfold$id==.)]]), 
                   v=2,repeats = 1))
 
names(train_valid_split)<- all_sample %>% 
  map(.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L))

test_site_split <- all_sample %>% 
  map(.,~assessment(site_vfold$splits[[which(site_vfold$id==.)]]))

names(test_site_split)<- all_sample %>% 
  map(.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L))

4.2 Fitting elastic net to the leave-one-site-out cross validation

Note, for cross site validation, we only focus on the baseline data. We didn’t use the followup data here.

4.2.1 set up the data and adjust for batch effects

enet_site_nocom <- 
  function(resp_var,
           split_train = analysis( data_two_fold$splits[[1]]),
           split_valid = assessment(data_two_fold$splits[[1]]),
           split_test=data_fold_test, 
           contrast_name){
    
              data_train <- split_train%>% 
                select(all_of(resp_var),
                       all_of(subj_info),
                       starts_with(contrast_name))%>%
                drop_na()%>% IQR_remove()
              
              data_valid <- split_valid%>% 
                select(all_of(resp_var),
                       all_of(subj_info),
                       starts_with(contrast_name))%>%
                drop_na()%>% IQR_remove()
              
              test_set <- split_test%>% 
                select(all_of(resp_var),
                       all_of(subj_info),
                       starts_with(contrast_name))%>%
                drop_na()%>% IQR_remove()
              
              ols_data_train <- 
                select(data_train,
                       resp_var, 
                       starts_with(contrast_name))
              
              ols_data_valid <- 
                select(data_valid,
                       resp_var,
                       starts_with(contrast_name))
              ols_test_set <- select(test_set,
                                     resp_var,
                                     starts_with(contrast_name))
              
    return(list(ols_data_train=ols_data_train,
              ols_data_valid=ols_data_valid,
              ols_test_set=ols_test_set,
              data_valid_id=data_valid$SUBJECTKEY,
              test_set_id=test_set$SUBJECTKEY,
              data_train_id=data_train$SUBJECTKEY))
                              }


enet_site_com <- function(resp_var,
                          split_train = analysis( data_two_fold$splits[[1]]),
                          split_valid = assessment(data_two_fold$splits[[1]]),
                          split_test=data_fold_test, 
                          measure_one,
                          measure_two){
  data_train <-split_train%>% 
    select(all_of(resp_var),
           all_of(subj_info),
           starts_with(measure_one),
           starts_with(measure_two))%>%
    drop_na()%>% IQR_remove()
 data_valid <- split_valid%>%
    select(all_of(resp_var),
           all_of(subj_info),
           starts_with(measure_one),
           starts_with(measure_two))%>%
   drop_na()%>% IQR_remove()
  test_set <- split_test%>% 
    select(all_of(resp_var),
           all_of(subj_info),
           starts_with(measure_one),
           starts_with(measure_two))%>%
    drop_na()%>% IQR_remove()
  ols_data_train <- batch_adjust(data_fold = data_train,
                                 resp = resp_var, 
                                 x=measure_one, 
                                 y=measure_two)
  ols_data_valid <- batch_adjust(data_fold = data_valid,
                                 resp = resp_var, 
                                 x=measure_one, 
                                 y= measure_two)
  ols_test_set <- select(test_set, 
                         resp_var,
                         starts_with(measure_one), 
                         starts_with(measure_two) ) 
  return(list(ols_data_train=ols_data_train,
              ols_data_valid=ols_data_valid,
              ols_test_set=ols_test_set,
              data_valid_id=data_valid$SUBJECTKEY,
              test_set_id=test_set$SUBJECTKEY,
              data_train_id=data_train$SUBJECTKEY))
}

# for models that require Combat that has one type of data. so far this was used for DTI only. This is bc DTI only one prefix ("fa")
enet_site_comOne <- function(resp_var,
                             split_train = analysis( data_two_fold$splits[[1]]),
                              split_valid = assessment(data_two_fold$splits[[1]]),
                             split_test=data_fold_test, measure_one){
  
  data_train <-split_train%>% 
    select(all_of(resp_var),
           all_of(subj_info),
           starts_with(measure_one))%>%
    drop_na()%>% 
    IQR_remove()
  
 data_valid <- split_valid%>%
    select(all_of(resp_var),
           all_of(subj_info),
           starts_with(measure_one))%>%
   drop_na()%>% 
   IQR_remove()
 
  test_set <- split_test%>% 
    select(all_of(resp_var),
           all_of(subj_info),
           starts_with(measure_one))%>%
    drop_na()%>% IQR_remove()
  ols_data_train <- batch_adjustOne(data_fold = data_train,
                                 resp = resp_var, x=measure_one)
  ols_data_valid <- batch_adjustOne(data_fold = data_valid,
                                 resp = resp_var, x=measure_one)
  ols_test_set <- select(test_set, 
                         resp_var,
                         starts_with(measure_one)) 
  return(list(ols_data_train=ols_data_train,
              ols_data_valid=ols_data_valid,
              ols_test_set=ols_test_set,
              data_valid_id=data_valid$SUBJECTKEY,
              test_set_id=test_set$SUBJECTKEY,
              data_train_id=data_train$SUBJECTKEY))
}

4.2.2 functions to prep the data and the model before the actual modeling

enet_tuning_site <- 
  function(resp_var, data_list){
     #recipe train and test data are from ols as the variables are the same
    norm_recipe <-recipe(as.formula(paste0(resp_var, "~ .")), 
                      data =data_list[["ols_data_train"]] ) %>%
                  step_dummy(all_nominal()) %>%
                  step_center(all_predictors()) %>%
                  step_scale(all_predictors())  %>% 
                  # estimate the means and standard deviations
                  prep(training = data_list[["ols_data_train"]], retain = TRUE)

## model
glmn_mod <- linear_reg(penalty = tune() ,mixture =tune()) %>% 
  set_engine("glmnet") 
set.seed(123456)
all_rs <- rsample::vfold_cv(data=data_list[["ols_data_train"]],v = 10,repeats = 1)
glmn_wfl <- workflow()%>%
  add_recipe(norm_recipe)%>%
  add_model(glmn_mod)
## penalty is lambda, mixture is alpha
## log10 transformed to get lambda between 10^-10 to 10^1
## this expands the default values (10^-10 to 10^0) from tidymodels https://github.com/tidymodels/dials/issues/140
glmn_set <- dials::parameters(penalty(range = c(-10,1), trans = log10_trans()),mixture())
## 100 levels of lambda and 11 levels of alpha (0, .1 to 1) # use 100 level as opposed to 200 to reduce the use ofram  
glmn_grid <- grid_regular(glmn_set, levels = c(100,11))
ctrl <- control_grid(save_pred = TRUE, verbose = TRUE)

glmn_tune <-tune::tune_grid(glmn_wfl,
            resamples = all_rs,
            grid = glmn_grid,
            metrics = metric_set(mae),##use traditional r square because there is no correlation
            control = ctrl)
##select the best tuned lambda and alpha
best_glmn <- select_best(glmn_tune, metric = "mae")
## use the best tunes parameters to fit the test data
glmn_wfl_final <- 
  glmn_wfl %>%
  finalize_workflow(best_glmn) %>%
  parsnip::fit(data = data_list[["ols_data_train"]])
### this is the prediction for random forest tuning
pred_glmn <- predict(glmn_wfl_final, new_data =data_list[["ols_data_valid"]])
###this is the prediction for test set
pred_test <- predict(glmn_wfl_final, new_data =data_list[["ols_test_set"]])
##compute the correlation between predicted results and the test data 
performance_glmn <- cor(pred_glmn$.pred,data_list[["ols_data_valid"]][[resp_var]])
performance_test <- cor(pred_test$.pred,data_list[["ols_test_set"]][[resp_var]])
best_glmn <- best_glmn %>% mutate(performance=performance_glmn,test_performance = performance_test)
return(list("param_tune"=best_glmn, 
            "predicted"=tibble(SUBJECTKEY=data_list[["data_valid_id"]],
                               "pred"=pred_glmn$.pred),
            "predicted_test"=tibble(SUBJECTKEY=data_list[["test_set_id"]], 
                                    "pred"=pred_test$.pred),
            "obs_train"=length(data_list[["data_train_id"]]),
            "obs_valid"=length(data_list[["data_valid_id"]])))
}

4.2.3 Combine the predicted values with Random Forest Oppornistic Stacking for each cognitive task across sites

  rnd_for_data_site = function(pred_select = "predicted",
                               data_list,site_idx,
                               resp_chose,
                               resp_data){
enet_pred <- 
  plyr::join_all(list(data_list[["MID_site"]][[resp_chose]][[site_idx]][[pred_select]]%>%
                        dplyr::rename("MID_pred"="pred"),
                      data_list[["smri_site"]][[resp_chose]][[site_idx]][[pred_select]]%>% 
                        dplyr::rename("smri_pred"="pred"),
                      data_list[["SST_site"]][[resp_chose]][[site_idx]][[pred_select]]%>% 
                        dplyr::rename("SST_pred"="pred"),
                      data_list[["rsmri_site"]][[resp_chose]][[site_idx]][[pred_select]]%>% 
                        dplyr::rename("rsmri_pred"="pred"),
                      data_list[["Nback_site"]][[resp_chose]][[site_idx]][[pred_select]]%>% 
                        dplyr::rename("Nback_pred"="pred"),
                      data_list[["DTI_site"]][[resp_chose]][[site_idx]][[pred_select]]%>% 
                        dplyr::rename("DTI_pred"="pred")),
                 by='SUBJECTKEY', type='full') %>% 
  distinct(SUBJECTKEY, .keep_all = TRUE)   

enet_pred_p <- enet_pred%>% mutate_all(~replace(., is.na(.), 1000))
enet_pred_n <- enet_pred%>% mutate_all(~replace(., is.na(.), -1000))

enet_pred_noNA <- left_join(enet_pred_p,
                            enet_pred_n,
                            by="SUBJECTKEY")
resp_data <- resp_data %>% 
  select("SUBJECTKEY",resp_chose)%>% 
  distinct(SUBJECTKEY, .keep_all = TRUE) %>%    
  drop_na()%>% 
  mutate_if(is.double, ~ (.x - mean(.x)) / sd(.x))

enet_pred_noNA <- left_join(enet_pred_noNA,resp_data,by="SUBJECTKEY")
enet_pred_noNA_noID <- select(enet_pred_noNA,-SUBJECTKEY)

return(list("ID"=enet_pred_noNA,"NoID"=enet_pred_noNA_noID))
}

4.2.4 random forest model fit for each cognitive task across sites

random_forest_site <- function(resp,valid_list,test_list,site_chose){
  ### to avoid repetition of coding a list is created with one data frame with ID and another without
rf_train_data <- valid_list[[resp]][[site_chose]]
rf_test_data <- test_list[[resp]][[site_chose]]

resp_var <- resp
##data recipe, assign "SUBJECTKEY" to ID (a class of variables neither belong to predictors nor response variable)
## do not scale the input data as 1000 and -1000 are put there on purpose
forest_rec <- recipe(as.formula(paste0(resp_var, "~ .")), data = rf_train_data[["NoID"]]) %>%
  step_dummy(all_nominal())   %>% 
    prep(training = rf_train_data[["NoID"]], retain = TRUE)
## mtry is the number of predictors to sample at each split
## min_n (the number of observations needed to keep splitting nodes)
tune_spec <- rand_forest(
  mtry = tune(),
  trees = 1000,
  min_n = tune()
) %>%
  set_mode("regression") %>%
  set_engine("ranger")
tune_wf <- workflow() %>%
  add_recipe(forest_rec) %>%
  add_model(tune_spec)
set.seed(123456)
## nested data validation 
all_rs <- rsample::vfold_cv(data=rf_train_data[["ID"]],v = 10,repeats = 1)
## automate generate grid for hyperparameters


rf_grid <- grid_regular(
  mtry(range = c(1, 12)), #12 is the highest number of predictors 
  min_n(range = c(1, 500)), #500 is enough based on earlier analyses
  levels = c(mtry = 12, min_n = 51) #mtry = 10 so that it increases one step at a time 
)


## dofuture package is imported by package furrr
doParallel::registerDoParallel()
future::plan(multisession,workers= all_cores)
tune_res <- tune_grid(
  tune_wf,
  resamples = all_rs,
  metrics = metric_set(rmse),
  grid = rf_grid
)
best_tune <- select_best(tune_res, metric = "rmse")

final_rf <- finalize_model(
  tune_spec,
  best_tune
)

forest_prep <- prep(forest_rec)
## this one fit the random forest model with the best tuned parameters
rf_fit <- final_rf %>%
  set_engine("ranger") %>%
 parsnip::fit(as.formula(paste0(resp_var, "~ .")),
    data = juice(forest_prep) 
  )
pred_rf <- predict(rf_fit,new_data = rf_test_data[["NoID"]])
performance_rf <- cor(pred_rf$.pred,rf_test_data[["ID"]][[resp]])
best_tune <- best_tune %>% mutate(performance=performance_rf)
result_list <- list("param_tune"=best_tune, "predicted"=tibble(SUBJECTKEY=rf_test_data[["ID"]]$SUBJECTKEY, "pred"=pred_rf$.pred),"model_fit"=rf_fit)
return(result_list)
}

4.2.5 extracting cross site performance for each cognitive task

change_modality_names <- function(performance_frame){
  performance_frame$modality[which(performance_frame$modality=="rsmri")]<- "rs_fMRI" 
performance_frame$modality[which(performance_frame$modality=="smri")]<- "sMRI"
performance_frame$modality[which(performance_frame$modality=="random_forest")]<- "Stacked"
performance_frame$modality[which(performance_frame$modality=="Nback")]<- "NBack"
performance_frame <- performance_frame%>%
  mutate(modality= as.factor(modality))%>%
  mutate(modality = factor(modality,levels =c ("Stacked","NBack","MID", "SST","rs_fMRI","sMRI","DTI")))
}

4.3 Leave one site out cross validation for the G factor

CFA_lav_response_site <- function(data_input = baseline_two_fold$splits[[1]], test_baseline =baseline_test){
  ## training the model and for enet parameter tuning
data_train <- analysis(data_input)%>%
              select(all_of(subj_info),all_of(TaskDVs1Batch))%>% drop_na()%>% IQR_remove()
## random forest tuning
data_valid <- assessment(data_input)%>%
              select(all_of(subj_info),all_of(TaskDVs1Batch))%>% drop_na()%>% IQR_remove()
##  test set for baseline observations
data_baseline <- test_baseline%>%
              select(all_of(subj_info),all_of(TaskDVs1Batch))%>% drop_na()%>% IQR_remove()
### first order CFA model

NeuroCog1stOrder <-'
Language =~ NIHTBX_PICVOCAB_UNCORRECTED + NIHTBX_READING_UNCORRECTED 
CognitiveFlexibity =~ NIHTBX_FLANKER_UNCORRECTED + NIHTBX_PATTERN_UNCORRECTED 
MemoryRecall =~ NIHTBX_PICTURE_UNCORRECTED + PEA_RAVLT_LD_TRIAL_VII_TC'

NeuroCog1stOrder.Fit <- lavaan::cfa(model = NeuroCog1stOrder, data = data_train,estimator="MLR")

### second order CFA model

NeuroCog2ndOrder <-'
Language =~ NIHTBX_PICVOCAB_UNCORRECTED + NIHTBX_READING_UNCORRECTED 
CognitiveFlexibity =~ NIHTBX_FLANKER_UNCORRECTED + NIHTBX_PATTERN_UNCORRECTED 
MemoryRecall =~ NIHTBX_PICTURE_UNCORRECTED + PEA_RAVLT_LD_TRIAL_VII_TC
g =~ NA*Language + CognitiveFlexibity  + MemoryRecall #estimate the loading of GenAbi -> as opposed to using it as a marker
g ~~ 1*g #need to constrain variance to 1'

NeuroCog2ndOrder.Fit <- lavaan::cfa(model = NeuroCog2ndOrder, data = data_train,estimator="MLR")

NeuroCogSingleFactor <-'
g =~ NIHTBX_PICVOCAB_UNCORRECTED + NIHTBX_READING_UNCORRECTED + NIHTBX_FLANKER_UNCORRECTED + NIHTBX_PATTERN_UNCORRECTED + NIHTBX_PICTURE_UNCORRECTED + PEA_RAVLT_LD_TRIAL_VII_TC'

NeuroCogSingleFactor.Fit <- lavaan::cfa(model = NeuroCogSingleFactor, data = data_train,estimator="MLR")

train_CFA_lav <- CFA_lav_output_processing(data_input = data_train,
                                           CFA_model1 = NeuroCog1stOrder.Fit,
                                           CFA_model2 = NeuroCog2ndOrder.Fit, 
                                           CFA_model3 = NeuroCogSingleFactor.Fit)
valid_CFA_lav <- CFA_lav_output_processing(data_input = data_valid,
                                           CFA_model1 = NeuroCog1stOrder.Fit,
                                           CFA_model2 = NeuroCog2ndOrder.Fit, 
                                           CFA_model3 = NeuroCogSingleFactor.Fit)
baseline_CFA_lav <- CFA_lav_output_processing(data_input = data_baseline,
                                              CFA_model1 = NeuroCog1stOrder.Fit,
                                              CFA_model2 = NeuroCog2ndOrder.Fit, 
                                              CFA_model3 = NeuroCogSingleFactor.Fit)

output_list <- list(enet_tuning = train_CFA_lav, RanFor_tuning = valid_CFA_lav, testing_baseline = baseline_CFA_lav)

return(list(output_list, NeuroCog1stOrder.Fit, NeuroCog2ndOrder.Fit,NeuroCogSingleFactor.Fit))
}

CFA_list_site <- all_site %>% map(.,~CFA_lav_response_site(data_input = train_valid_split[[.]][["splits"]][[1]], 
                                                           test_baseline =test_site_split[[.]])) 

names(CFA_list_site) <- all_site

CFA_resp_site <- c("g_second_order" )

4.3.1 functions for running enet across sites

Nback_cross_site_CFA <- function(resp_chosen){
 # doParallel::registerDoParallel()
### map over all sites instead of one single response variable
  
  Nback_site_data <- all_site %>% 
    map(.,~enet_site_nocom(resp_var = resp_chosen,
                           split_train = left_join(analysis(train_valid_split[[.]]$splits[[1]]),
                                                   CFA_list_site[[.]][[1]][["enet_tuning"]],
                                                   by="SUBJECTKEY")%>% 
                             distinct(SUBJECTKEY, .keep_all = TRUE)  ,
                           split_valid=left_join(assessment(train_valid_split[[.]]$splits[[1]]),
                                                 CFA_list_site[[.]][[1]][["RanFor_tuning"]],
                                                 by="SUBJECTKEY")%>% 
                             distinct(SUBJECTKEY, .keep_all = TRUE)   ,
                           split_test = left_join(test_site_split[[.]],
                                                  CFA_list_site[[.]][[1]][["testing_baseline"]],
                                                  by="SUBJECTKEY")%>% 
                             distinct(SUBJECTKEY, .keep_all = TRUE) , 
                           contrast_name = "X2backvs0back_ROI_"))

  names(Nback_site_data) <- all_sample %>% 
    future_map (.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L),
              .options = furrr_options(seed=TRUE),
              .progress = TRUE) 

 Nback_site <- all_site %>%
   map(.,~enet_tuning_site(resp_var =resp_chosen,
                           data_list = Nback_site_data[[.]]))

names(Nback_site)<- all_sample %>% 
  map (.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L)) 

return(Nback_site)
}

SST_cross_site_CFA <- function(resp_chosen){
#  doParallel::registerDoParallel()
### map over all sites instead of one single response variable

SST_site_data <- all_site %>% 
  map(.,~enet_site_nocom(resp_var = resp_chosen,
                         split_train = left_join(analysis(train_valid_split[[.]]$splits[[1]]),
                                                 CFA_list_site[[.]][[1]][["enet_tuning"]],
                                                 by="SUBJECTKEY")%>% 
                           distinct(SUBJECTKEY, .keep_all = TRUE)  ,
                           split_valid=left_join(assessment(train_valid_split[[.]]$splits[[1]]),
                                                 CFA_list_site[[.]][[1]][["RanFor_tuning"]],
                                                 by="SUBJECTKEY")%>% 
                           distinct(SUBJECTKEY, .keep_all = TRUE)   ,
                           split_test = left_join(test_site_split[[.]],
                                                  CFA_list_site[[.]][[1]][["testing_baseline"]],
                                                  by="SUBJECTKEY")%>% 
                           distinct(SUBJECTKEY, .keep_all = TRUE) ,
                         contrast_name = "anystopvscorrectgo_ROI_"))

 names(SST_site_data) <- all_sample %>% 
   map(.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L)) 

SST_site <- all_site %>% 
  map(.,~enet_tuning_site(resp_var =resp_chosen,
                          data_list = SST_site_data[[.]]))

 names(SST_site)<- all_sample %>% 
   map(.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L)) 

return(SST_site)
}

MID_cross_site_CFA <- function(resp_chosen){
 # doParallel::registerDoParallel()
### map over all sites instead of one single response variable
MID_site_data <- all_site %>% 
  map(.,~enet_site_nocom(resp_var = resp_chosen,
                         split_train =left_join(analysis(train_valid_split[[.]]$splits[[1]]),
                                                CFA_list_site[[.]][[1]][["enet_tuning"]],
                                                by="SUBJECTKEY")%>% 
                           distinct(SUBJECTKEY, .keep_all = TRUE)  ,
                           split_valid=left_join(assessment(train_valid_split[[.]]$splits[[1]]),
                                                 CFA_list_site[[.]][[1]][["RanFor_tuning"]],
                                                 by="SUBJECTKEY")%>% 
                           distinct(SUBJECTKEY, .keep_all = TRUE)   ,
                           split_test = left_join(test_site_split[[.]],
                                                  CFA_list_site[[.]][[1]][["testing_baseline"]],
                                                  by="SUBJECTKEY")%>% 
                           distinct(SUBJECTKEY, .keep_all = TRUE) ,
                         contrast_name = "antiRewVsNeu_ROI_")) 

names(MID_site_data)<- all_sample %>% 
  map(.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L)) 

MID_site <- all_site %>% 
  map(.,~enet_tuning_site(resp_var =resp_chosen,
                          data_list = MID_site_data[[.]]))

names(MID_site) <- all_sample %>% 
  map(.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L)) 

return(MID_site)
}


smri_cross_site_CFA <- function(resp_chosen){
 # doParallel::registerDoParallel()
### map over all sites instead of one single response variable
smri_site_data <- all_site %>% 
  map(.,~enet_site_com(resp_var = resp_chosen,
                       split_train = left_join(analysis(train_valid_split[[.]]$splits[[1]]),
                                               CFA_list_site[[.]][[1]][["enet_tuning"]],
                                               by="SUBJECTKEY")%>% 
                         distinct(SUBJECTKEY, .keep_all = TRUE)  ,
                           split_valid=left_join(assessment(train_valid_split[[.]]$splits[[1]]),
                                                 CFA_list_site[[.]][[1]][["RanFor_tuning"]],
                                                 by="SUBJECTKEY")%>% 
                         distinct(SUBJECTKEY, .keep_all = TRUE)   ,
                           split_test = left_join(test_site_split[[.]],
                                                  CFA_list_site[[.]][[1]][["testing_baseline"]],
                                                  by="SUBJECTKEY")%>% 
                         distinct(SUBJECTKEY, .keep_all = TRUE) ,
                       measure_one = "ASEG_",
                       measure_two = "Dest_Thick_"))

names(smri_site_data)<- all_sample %>% 
  map(.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L)) 
smri_site <- all_site %>% 
  map(.,~enet_tuning_site(resp_var =resp_chosen,
                          data_list = smri_site_data[[.]]))
names(smri_site)<- all_sample %>% 
  map(.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L)) 
return(smri_site)
}


rsmri_cross_site_CFA <- function(resp_chosen){
#  doParallel::registerDoParallel()
### map over all sites instead of one single response variable
rsmri_site_data <- all_site %>% 
  map(.,~enet_site_com(resp_var = resp_chosen,
                       split_train = left_join(analysis(train_valid_split[[.]]$splits[[1]]),
                                               CFA_list_site[[.]][[1]][["enet_tuning"]],
                                               by="SUBJECTKEY")%>% 
                         distinct(SUBJECTKEY, .keep_all = TRUE)  ,
                           split_valid=left_join(assessment(train_valid_split[[.]]$splits[[1]]),
                                                 CFA_list_site[[.]][[1]][["RanFor_tuning"]],
                                                 by="SUBJECTKEY")%>%
                         distinct(SUBJECTKEY, .keep_all = TRUE)   ,
                           split_test = left_join(test_site_split[[.]],
                                                  CFA_list_site[[.]][[1]][["testing_baseline"]],
                                                  by="SUBJECTKEY")%>% 
                         distinct(SUBJECTKEY, .keep_all = TRUE) ,
                       measure_one = "par",
                       measure_two = "Within"))

 names(rsmri_site_data)  <- all_sample %>% 
   map (.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L)) 
 
 rsmri_site <- all_site %>% 
   map(.,~enet_tuning_site(resp_var =resp_chosen,
                           data_list = rsmri_site_data[[.]]))

names(rsmri_site) <- all_sample %>% 
  map(.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L)) 

return(rsmri_site)
}


DTI_cross_site_CFA <- function(resp_chosen){
#  doParallel::registerDoParallel()
### map over all sites instead of one single response variable
DTI_site_data <-  all_site %>% 
  map(.,~enet_site_comOne(resp_var = resp_chosen,
                          split_train =
                            left_join(analysis(train_valid_split[[.]]$splits[[1]]),
                                      CFA_list_site[[.]][[1]][["enet_tuning"]],
                                      by="SUBJECTKEY")%>% 
                            distinct(SUBJECTKEY, .keep_all = TRUE)  ,
                           split_valid=left_join(assessment(train_valid_split[[.]]$splits[[1]]),
                                                 CFA_list_site[[.]][[1]][["RanFor_tuning"]],
                                                 by="SUBJECTKEY")%>% 
                            distinct(SUBJECTKEY, .keep_all = TRUE)   ,
                           split_test = left_join(test_site_split[[.]],
                                                  CFA_list_site[[.]][[1]][["testing_baseline"]],
                                                  by="SUBJECTKEY")%>% 
                            distinct(SUBJECTKEY, .keep_all = TRUE) ,
                          measure_one = "FA_")) 
names(DTI_site_data)  <- all_sample %>%
  map(.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L)) 

DTI_site <-  all_site %>% 
  map(.,~enet_tuning_site(resp_var =resp_chosen,
                          data_list = DTI_site_data[[.]]))
  
 names(DTI_site)<- all_sample %>% map(.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L)) 

return(DTI_site)
}

4.3.2 Acutal enet parameter tuning for G-Factor across sites

Nback_enet_cross_site_CFA <- CFA_resp_site %>% map(.,~Nback_cross_site_CFA(resp_chosen =.))
SST_enet_cross_site_CFA <- CFA_resp_site %>% map(.,~SST_cross_site_CFA(resp_chosen =.))
MID_enet_cross_site_CFA <- CFA_resp_site %>% map(.,~MID_cross_site_CFA(resp_chosen =.))
smri_enet_cross_site_CFA <- CFA_resp_site %>% map(.,~smri_cross_site_CFA(resp_chosen =.))
rsmri_enet_cross_site_CFA <- CFA_resp_site%>% map(.,~rsmri_cross_site_CFA(resp_chosen =.))
DTI_enet_cross_site_CFA <- CFA_resp_site %>% map(.,~DTI_cross_site_CFA(resp_chosen =.))


rf_enet_cross_site_CFA <- list("Nback_site"=Nback_enet_cross_site_CFA,
                           "SST_site"=SST_enet_cross_site_CFA,
                           "MID_site"=MID_enet_cross_site_CFA,
                           "smri_site"=smri_enet_cross_site_CFA,
                           "rsmri_site"=rsmri_enet_cross_site_CFA,
                           "DTI_site"=DTI_enet_cross_site_CFA)

4.3.3 Actual random forest model fit for G-Factor across sites

all_cores <- parallel::detectCores(logical = FALSE) - 2


mri_site <- paste0(mri_vec,"_site")
for(i in 1:length(mri_site)){
  names(rf_enet_cross_site_CFA[[mri_site[i]]])<- CFA_resp_site
}

random_forest_validsite_CFA <- CFA_resp_site %>% map(.,function(resp_idx=.){
random_forest_onesite <-   all_site %>%map(.,
                                           ~rnd_for_data_site(pred_select = "predicted",
                                                              data_list=rf_enet_cross_site_CFA,site_idx = .,
                                                              resp_chose=resp_idx,
                                                              resp_data=CFA_list_site[[.]][[1]][["RanFor_tuning"]]) ) 
names(random_forest_onesite) <- all_site
return(random_forest_onesite)
})

## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(resp_chose)` instead of `resp_chose` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.

names(random_forest_validsite_CFA) <- CFA_resp_site

random_forest_testsite_CFA <- CFA_resp_site %>% map(.,function(resp_idx=.){
random_forest_onesite <-   all_site %>%
  map(.,~rnd_for_data_site(pred_select = "predicted_test",
                           data_list=rf_enet_cross_site_CFA,
                           site_idx = .,
                           resp_chose=resp_idx,
                           resp_data=CFA_list_site[[.]][[1]][["testing_baseline"]])) 
names(random_forest_onesite) <- all_site
return(random_forest_onesite)
})
names(random_forest_testsite_CFA) <- CFA_resp_site

##keep from using all the cores
##keep your computer safe
no_cores <- availableCores() - 4
plan(multisession,workers= no_cores)
### this computation takes more than 10 cores and around 50 gb ram

random_forest_crosssite_CFA <- CFA_resp_site %>% future_map(.,function(resp_idx=.){
random_forest_oneresp  <-all_site %>% future_map(.,~random_forest_site(resp = resp_idx,
                                                                valid_list = random_forest_validsite_CFA,
                                                                test_list = random_forest_testsite_CFA,site_chose = .),
             .options = furrr_options(seed=TRUE))
names(random_forest_oneresp) <- all_site

return(random_forest_oneresp)
  },.options = furrr_options(seed=TRUE))

4.3.4 extracting cross site performance for G-Factor

### enet
modality_site_CFA <-names(rf_enet_cross_site_CFA) %>% str_remove_all("_site")
names(rf_enet_cross_site_CFA)<- modality_site_CFA

cross_site_enet_performance_CFA = function(resp_vec,data_input,modality_input,target_data){
  data_split <- data_input[[modality_input]]
  one_modality_extract <- resp_vec %>% 
    
    map(.,function(resp_input=.){
      performance_extract <- all_site %>%
        
        map(.,function(site_idx){
          performance_site <- 
            data_split[[resp_input]][[site_idx]][["param_tune"]][["test_performance"]]
          mae_frame <- left_join(data_split[[resp_input]][[site_idx]][["predicted_test"]],
                                 target_data[[site_idx]][[1]][["testing_baseline"]], 
                                 by="SUBJECTKEY")%>% 
            select("pred",all_of(resp_input) ,"SUBJECTKEY") %>%
            distinct(SUBJECTKEY, .keep_all = TRUE)%>% drop_na()
           mae_frame  <- mae_frame %>% mutate(scaled_target = as.vector(scale(mae_frame[[resp_input]])) )
          ## the data has to be referenced like this to be put into this function
          mae_perform <- yardstick::mae(data = mae_frame,
                                        truth=scaled_target,
                                        estimate=pred)
          rmse_perform <- yardstick::rmse(data = mae_frame,
                                          truth=scaled_target,
                                          estimate=pred)
          tradRsq_perform <- yardstick::rsq_trad(data = mae_frame,
                                                 truth=scaled_target,
                                                 estimate=pred)  
          
          combined_results <- tibble(performance = as.numeric(performance_site),
                                     site=site_idx,
                                     modality=modality_input,
                                     response=resp_input,
                                     mae_performance=mae_perform$.estimate,
                                     rmse_performance=rmse_perform$.estimate,
                                     tradRsq_performance=tradRsq_perform$.estimate
                                     ) 
          return(combined_results)
        })%>% 
        bind_rows()
    })
}

performance_enet_cross_site_CFA <- modality_vec %>% 
  map(.,~cross_site_enet_performance_CFA(resp_vec = CFA_resp_site,
                                     data_input = rf_enet_cross_site_CFA,
                                     modality_input = .,target_data =CFA_list_site))
names(performance_enet_cross_site_CFA) <- modality_vec

for(i in 1: length(modality_vec)){
  names(performance_enet_cross_site_CFA[[modality_vec[i]]])<- CFA_resp_site
}

response_enet_cross_site_CFA <- CFA_resp_site %>% 
  map(.,function(resp_input=.,
                 data_input=performance_enet_cross_site_CFA){
  one_resp_performance <-  modality_site_CFA %>% 
    map(.,function(modality_input=.){
    one_resp_modality <- data_input[[modality_input]][[resp_input]]
    return(one_resp_modality)
  })%>% bind_rows()
  
 one_resp_performance_test<- one_resp_performance 
 
return(one_resp_performance_test)
})
names(response_enet_cross_site_CFA) <- CFA_resp_site

### random forest

names(random_forest_crosssite_CFA)<- CFA_resp_site

cross_site_rf_performance_CFA = function(resp_idx,data_input=random_forest_crosssite,
                                     plotting_frame=resp_var_plotting,target_data=test_site_split){
  site_performance <- all_site %>% map(.,function(site_idx){
    one_site <- data_input[[resp_idx]][[site_idx]][["param_tune"]][["performance"]]
    
    mae_frame <- left_join(data_input[[resp_idx]][[site_idx]][["predicted"]],
                           target_data[[site_idx]][[1]][["testing_baseline"]], 
                           by="SUBJECTKEY")%>% select("pred",all_of(resp_idx) ,"SUBJECTKEY") %>%
      distinct(SUBJECTKEY, .keep_all = TRUE)%>% drop_na()
     mae_frame  <- mae_frame %>% mutate(scaled_target = as.vector(scale(mae_frame[[resp_idx]])) )
    ## the data has to be referenced like this to be put into this function
    mae_perform <- yardstick::mae(data = mae_frame,truth=scaled_target,estimate=pred)  
    rmse_perform <- yardstick::rmse(data = mae_frame,truth=scaled_target,estimate=pred)  
    tradRsq_perform <- yardstick::rsq_trad(data = mae_frame,truth=scaled_target,estimate=pred)  

    combined_results <- tibble(performance = as.numeric(one_site),
                               site=site_idx,
                               modality="random_forest",
                               response=resp_idx,
                               longer_name = plotting_frame$longer_name[which(plotting_frame$response==response)],
                               shorter_name = plotting_frame$short_name[which(plotting_frame$response==response[1])],
                               mae_performance=mae_perform$.estimate,
                               rmse_performance=rmse_perform$.estimate,
                               tradRsq_performance=tradRsq_perform$.estimate)
    return(combined_results)
  })%>% bind_rows() 
}


performance_rf_cross_site_CFA <- CFA_resp_site %>% 
  map(.,~cross_site_rf_performance_CFA(resp_idx=.,
                                       data_input = random_forest_crosssite_CFA,
                                       plotting_frame = CFA_var_plotting,
                                       target_data =CFA_list_site))
names(performance_rf_cross_site_CFA)<- CFA_resp_site
performance_all_modalities_CFA <- CFA_resp_site %>% 
  map(.,~bind_rows(performance_rf_cross_site_CFA[[.]],
                   response_enet_cross_site_CFA[[.]])%>%
        mutate(site_num=str_remove(.[['site']],"site") ) ) 
names(performance_all_modalities_CFA)<- CFA_resp_site
#modality_ploting <- c("Stacked","NBack","MID", "SST","rs_fMRI","sMRI","DTI")
performance_all_modalities_CFA <- performance_all_modalities_CFA %>% 
  map(.,~change_modality_names(performance_frame = .))
  
   
CFA_resp_site %>%map(.,~ggplot(performance_all_modalities_CFA [[.]], 
                            aes(site_num, performance, 
                                col = modality))+
  geom_point(key_glyph = "point", size = 3)+
    labs(x = 'Site', 
         y=NULL, 
         col = 'Model',
          title  = paste0("Out-of-site Predictive Ability (Pearson's r)\n",
                       CFA_var_plotting$short_name[[which(CFA_var_plotting$response==.)]]))+
    theme(axis.text.x=element_text(size=16, angle = 45),
          axis.text.y=element_text(size=16),
          axis.title.y=element_text(size=18),
          axis.title.x=element_text(size=18),
          plot.title=element_text(size=20),
          legend.position = "right",
          legend.title=element_blank(),
          legend.text = element_text( size=18)))

[[1]]

## Warning: Removed 3 rows containing missing values.

CFA_resp_site %>%map(.,~ggplot(performance_all_modalities_CFA [[.]], 
                            aes(site_num, performance_all_modalities_CFA [[.]]$mae_performance, 
                                col = modality))+
  geom_point(key_glyph = "point", size = 3)+
    labs(x = 'Site', 
         y=NULL, 
         col = 'Model',
          title  = paste0("Out-of-site Predictive Ability (MAE)\n",
                       CFA_var_plotting$short_name[[which(CFA_var_plotting$response==.)]]))+
    theme(axis.text.x=element_text(size=16, angle = 45),
          axis.text.y=element_text(size=16),
          axis.title.y=element_text(size=18),
          axis.title.x=element_text(size=18),
          plot.title=element_text(size=20),
          legend.position = "right",
          legend.title=element_blank(),
          legend.text = element_text( size=18)))

[[1]]

## Warning: Removed 3 rows containing missing values.

CFA_resp_site %>%map(.,~ggplot(performance_all_modalities_CFA [[.]], 
                            aes(site_num, performance_all_modalities_CFA [[.]]$rmse_performance, 
                                col = modality))+
  geom_point(key_glyph = "point", size = 3)+
    labs(x = 'Site', 
         y=NULL, 
         col = 'Model',
          title  = paste0("Out-of-site Predictive Ability (RMSE)\n",
                       CFA_var_plotting$short_name[[which(CFA_var_plotting$response==.)]]))+
    theme(axis.text.x=element_text(size=16, angle = 45),
          axis.text.y=element_text(size=16),
          axis.title.y=element_text(size=18),
          axis.title.x=element_text(size=18),
          plot.title=element_text(size=20),
          legend.position = "right",
          legend.title=element_blank(),
          legend.text = element_text( size=18)))

[[1]]

## Warning: Removed 3 rows containing missing values.

CFA_resp_site %>%map(.,~ggplot(performance_all_modalities_CFA [[.]], 
                            aes(site_num, performance_all_modalities_CFA [[.]]$tradRsq_performance, 
                                col = modality))+
  geom_point(key_glyph = "point", size = 3)+
    labs(x = 'Site', 
         y=NULL, 
         col = 'Model',
          title  = paste0("Out-of-site Predictive Ability (R squared)\n",
                       CFA_var_plotting$short_name[[which(CFA_var_plotting$response==.)]]))+
    theme(axis.text.x=element_text(size=16, angle = 45),
          axis.text.y=element_text(size=16),
          axis.title.y=element_text(size=18),
          axis.title.x=element_text(size=18),
          plot.title=element_text(size=20),
          legend.position = "right",
          legend.title=element_blank(),
          legend.text = element_text( size=18)))

[[1]]

## Warning: Removed 3 rows containing missing values.

### getting summary statistics 
performance_all_modalities_CFA <- performance_all_modalities_CFA %>% map(.,~mutate(.,Pearson_r =performance)%>%
                                                                           mutate(.,R_squared = tradRsq_performance)%>%
                                                                           mutate(.,MAE=mae_performance)%>%
                                                                           mutate(.,RMSE=rmse_performance))



performance_all_modalities_CFA %>% map(.,~group_by(.,modality)%>%
                                     summarise_at(c("Pearson_r","R_squared","MAE","RMSE"),
                                                  mean,na.rm=TRUE)%>%
                                       mutate_at(vars(c("Pearson_r","R_squared","MAE","RMSE")),funs(round(.,3)))%>% 
                                     kableExtra::kbl(caption = paste0("Mean of all modalities cross sites ",
                                                                      paste0("\n",resp_var_plotting$longer_name[which(resp_var_plotting$response==.)]))) %>%
                                     kableExtra::kable_classic(full_width = F, html_font = "Cambria") 
)

$g_second_order

Mean of all modalities cross sites
modality	Pearson_r	R_squared	MAE	RMSE
Stacked	0.461	0.211	0.699	0.887
NBack	0.410	0.168	0.718	0.910
MID	0.223	0.049	0.774	0.973
SST	0.136	0.017	0.784	0.989
rs_fMRI	0.261	0.068	0.763	0.964
sMRI	0.246	0.062	0.763	0.967
DTI	0.220	0.047	0.767	0.975

performance_all_modalities_CFA %>% map(.,~group_by(.,modality)%>%
                                     summarise_at(c("Pearson_r","R_squared","MAE","RMSE"),
                                                  sd,na.rm=TRUE)%>%
                                       mutate_at(vars(c("Pearson_r","R_squared","MAE","RMSE")),funs(round(.,3)))%>% 
                                     kableExtra::kbl(caption = paste0("Standard deviation of all modalities cross sites ",
                                                                      paste0("\n",resp_var_plotting$longer_name[which(resp_var_plotting$response==.)]))) %>%
                                     kableExtra::kable_classic(full_width = F, html_font = "Cambria") 
)

$g_second_order

Standard deviation of all modalities cross sites
modality	Pearson_r	R_squared	MAE	RMSE
Stacked	0.055	0.050	0.023	0.028
NBack	0.063	0.051	0.027	0.029
MID	0.093	0.044	0.019	0.023
SST	0.066	0.020	0.012	0.010
rs_fMRI	0.058	0.029	0.015	0.015
sMRI	0.088	0.043	0.022	0.023
DTI	0.071	0.035	0.017	0.018

4.3.5 Plotting cross site performance for G-Factor within one plot

corr_plot <- CFA_resp_site %>%map(.,~ggplot(performance_all_modalities_CFA [[.]], 
                            aes(site_num, performance, 
                                col = modality))+
  geom_point(key_glyph = "point", size = 3)+
    labs(x = NULL, 
         y=NULL, 
         col = 'Model',
          title  = paste0("Pearson's r"))+
    theme(axis.text.x=element_text(size=14, angle = 45),
          axis.text.y=element_text(size=14),
          axis.title.y=element_text(size=16),
          axis.title.x=element_text(size=16),
          plot.title=element_text(size=18),
          legend.position = "right",
          legend.title=element_blank(),
          legend.text = element_text( size=18)))   

mae_plot <- CFA_resp_site %>%map(.,~ggplot(performance_all_modalities_CFA [[.]], 
                            aes(site_num, performance_all_modalities_CFA [[.]]$mae_performance, 
                                col = modality))+
  geom_point(key_glyph = "point", size = 3)+
    labs(x = NULL, 
         y=NULL, 
         col = 'Model',
          title  = paste0("MAE"))+
    theme(axis.text.x=element_text(size=14, angle = 45),
          axis.text.y=element_text(size=14),
          axis.title.y=element_text(size=16),
          axis.title.x=element_text(size=16),
          plot.title=element_text(size=18),
          legend.position = "right",
          legend.title=element_blank(),
          legend.text = element_text( size=18)))   

rmse_plot <- CFA_resp_site %>%map(.,~ggplot(performance_all_modalities_CFA [[.]], 
                            aes(site_num, performance_all_modalities_CFA [[.]]$rmse_performance, 
                                col = modality))+
  geom_point(key_glyph = "point", size = 3)+
    labs(x = NULL, 
         y=NULL, 
         col = 'Model',
          title  = paste0("RMSE"))+
    theme(axis.text.x=element_text(size=14, angle = 45),
          axis.text.y=element_text(size=14),
          axis.title.y=element_text(size=16),
          axis.title.x=element_text(size=16),
          plot.title=element_text(size=18),
          legend.position = "right",
          legend.title=element_blank(),
          legend.text = element_text( size=18)))   

rsq_plot <- CFA_resp_site %>%map(.,~ggplot(performance_all_modalities_CFA [[.]], 
                            aes(site_num, performance_all_modalities_CFA [[.]]$tradRsq_performance, 
                                col = modality))+
  geom_point(key_glyph = "point", size = 3)+
    labs(x = NULL, 
         y=NULL, 
         col = 'Model',
          title  = paste0("R squared"))+
    theme(axis.text.x=element_text(size=14, angle = 45),
          axis.text.y=element_text(size=14),
          axis.title.y=element_text(size=16),
          axis.title.x=element_text(size=16),
          plot.title=element_text(size=18),
          legend.position = "right",
          legend.title=element_blank(),
          legend.text = element_text( size=18)))   

title_combined_plot <- ggdraw() + 
  draw_label(
    "Out-of-site Predictive Ability",
    fontface = 'bold',
    x = 0,
    hjust = 0,
    size=24
  ) +
  theme(
    # add margin on the left of the drawing canvas,
    # so title is aligned with left edge of first plot
    plot.margin = margin(0, 0, 0, 5)
  )
plot_combined_list <- list (corr_plot= corr_plot[[1]], 
                            rsq_plot = rsq_plot[[1]], 
                            mae_plot = mae_plot[[1]],
                            rmse_plot=rmse_plot[[1]])
combined_plot <- ggpubr::ggarrange(title_combined_plot,
                                   ggpubr::ggarrange(plotlist=plot_combined_list, 
                                                     common.legend = TRUE,
                                                     legend = "right"),
                                   nrow = 2 , heights = c(0.1, 1))

## Warning: Removed 3 rows containing missing values.
## Removed 3 rows containing missing values.
## Removed 3 rows containing missing values.
## Removed 3 rows containing missing values.
## Removed 3 rows containing missing values.

ggpubr::annotate_figure(combined_plot,bottom = ggpubr::text_grob("Site",size=15))

5 R session and library

pander::pander(sessionInfo())

R version 4.1.3 (2022-03-10)

Platform: x86_64-w64-mingw32/x64 (64-bit)

locale: LC_COLLATE=English_New Zealand.1252, LC_CTYPE=English_New Zealand.1252, LC_MONETARY=English_New Zealand.1252, LC_NUMERIC=C and LC_TIME=English_New Zealand.1252

attached base packages: stats4, grid, stats, graphics, grDevices, utils, datasets, methods and base

other attached packages: fastshap(v.0.0.7), ggsegJHU(v.1.0.1.001), ggsegDesterieux(v.1.0.1.002), ggsegExtra(v.1.5.33.004), ggseg3d(v.1.6.3), ggseg(v.1.6.4), vip(v.0.3.2), sva(v.3.42.0), BiocParallel(v.1.28.3), genefilter(v.1.76.0), mgcv(v.1.8-39), nlme(v.3.1-155), corpcor(v.1.6.10), permimp(v.1.0-2), party(v.1.3-10), strucchange(v.1.5-2), sandwich(v.3.0-1), zoo(v.1.8-10), modeltools(v.0.2-23), mvtnorm(v.1.1-3), furrr(v.0.3.0), future(v.1.25.0), plotly(v.4.10.0), cowplot(v.1.1.1), yardstick(v.0.0.9), workflowsets(v.0.2.1), workflows(v.0.2.6), tune(v.0.2.0), rsample(v.0.1.1), recipes(v.0.2.0), parsnip(v.0.2.1), modeldata(v.0.1.1), infer(v.1.0.0), dials(v.0.1.1), scales(v.1.2.0), tidymodels(v.0.2.0), pander(v.0.6.4), olsrr(v.0.5.3), glmnet(v.4.1-4), Matrix(v.1.4-0), eNetXplorer(v.1.1.3), broom(v.0.8.0), forcats(v.0.5.1), stringr(v.1.4.0), dplyr(v.1.0.8), purrr(v.0.3.4), readr(v.2.1.2), tidyr(v.1.2.0), tibble(v.3.1.6), ggplot2(v.3.3.6) and tidyverse(v.1.3.1)

loaded via a namespace (and not attached): rgl(v.0.108.3), Hmisc(v.4.6-0), svglite(v.2.1.0), class(v.7.3-20), foreach(v.1.5.2), crayon(v.1.5.1), MASS(v.7.3-55), backports(v.1.4.1), reprex(v.2.0.1), rlang(v.1.0.2), XVector(v.0.34.0), readxl(v.1.4.0), extrafontdb(v.1.0), nloptr(v.2.0.0), limma(v.3.50.3), naniar(v.0.6.1), extrafont(v.0.18), bit64(v.4.0.5), glue(v.1.6.2), parallel(v.4.1.3), oro.nifti(v.0.11.0), AnnotationDbi(v.1.56.2), BiocGenerics(v.0.40.0), classInt(v.0.4-3), haven(v.2.5.0), tidyselect(v.1.1.2), RRPP(v.1.2.3), XML(v.3.99-0.9), calibrate(v.1.7.7), sf(v.1.0-7), ggpubr(v.0.4.0), SuppDists(v.1.1-9.7), xtable(v.1.8-4), magrittr(v.2.0.3), evaluate(v.0.15), cli(v.3.2.0), zlibbioc(v.1.40.0), rstudioapi(v.0.13), sp(v.1.4-7), DiceDesign(v.1.9), bslib(v.0.3.1), rpart(v.4.1.16), pbmcapply(v.1.5.1), numform(v.0.7.0), xfun(v.0.30), cluster(v.2.1.2), caTools(v.1.18.2), KEGGREST(v.1.34.0), RNifti(v.1.4.0), expm(v.0.999-6), ape(v.5.6-2), listenv(v.0.8.0), reshape(v.0.8.9), Biostrings(v.2.62.0), png(v.0.1-7), ipred(v.0.9-12), withr(v.2.5.0), ggforce(v.0.3.3), neurobase(v.1.32.1), bitops(v.1.0-7), freesurfer(v.1.6.8), ranger(v.0.13.1), plyr(v.1.8.6), cellranger(v.1.1.0), hardhat(v.0.2.0), e1071(v.1.7-9), pROC(v.1.18.0), coda(v.0.19-4), pillar(v.1.7.0), RcppParallel(v.5.1.5), gplots(v.3.1.3), cachem(v.1.0.6), multcomp(v.1.4-19), fs(v.1.5.2), raster(v.3.5-15), geomorph(v.4.0.3), vctrs(v.0.4.0), pbivnorm(v.0.6.0), ellipsis(v.0.3.2), generics(v.0.1.2), nortest(v.1.0-4), lava(v.1.6.10), rgdal(v.1.5-31), tools(v.4.1.3), foreign(v.0.8-82), tweenr(v.1.0.2), munsell(v.0.5.0), emmeans(v.1.7.3), proxy(v.0.4-26), fastmap(v.1.1.0), compiler(v.4.1.3), abind(v.1.4-5), stars(v.0.5-5), DescTools(v.0.99.45), semPlot(v.1.1.5), GenomeInfoDbData(v.1.2.7), prodlim(v.2019.11.13), gridExtra(v.2.3), OpenMx(v.2.20.6), edgeR(v.3.36.0), lattice(v.0.20-45), utf8(v.1.2.2), jsonlite(v.1.8.0), arm(v.1.12-2), GGally(v.2.1.2), gld(v.2.6.4), pbapply(v.1.5-0), carData(v.3.0-5), estimability(v.1.3), lazyeval(v.0.2.2), car(v.3.0-13), R.utils(v.2.11.0), latticeExtra(v.0.6-29), goftest(v.1.2-3), checkmate(v.2.0.0), rmarkdown(v.2.14), openxlsx(v.4.2.5), webshot(v.0.5.3), Biobase(v.2.54.0), igraph(v.1.2.11), survival(v.3.2-13), numDeriv(v.2016.8-1.1), yaml(v.2.3.5), timeROC(v.0.4), systemfonts(v.1.0.4), survivalROC(v.1.0.3), htmltools(v.0.5.2), memoise(v.2.0.1), lavaan(v.0.6-11), locfit(v.1.5-9.5), IRanges(v.2.28.0), viridisLite(v.0.4.0), digest(v.0.6.29), assertthat(v.0.2.1), timereg(v.2.0.2), lwgeom(v.0.2-8), Rttf2pt1(v.1.3.10), units(v.0.8-0), RSQLite(v.2.2.13), future.apply(v.1.9.0), rockchalk(v.1.8.151), Exact(v.3.1), data.table(v.1.14.2), blob(v.1.2.3), R.oo(v.1.24.0), S4Vectors(v.0.32.4), lhs(v.1.1.5), splines(v.4.1.3), Formula(v.1.2-4), labeling(v.0.4.2), RCurl(v.1.98-1.6), pec(v.2022.03.06), hms(v.1.1.1), modelr(v.0.1.8), colorspace(v.2.0-3), base64enc(v.0.1-3), mnormt(v.2.0.2), survcomp(v.1.44.1), shape(v.1.4.6), tmvnsim(v.1.0-2), libcoin(v.1.0-9), nnet(v.7.3-17), sass(v.0.4.1), Rcpp(v.1.0.8), coin(v.1.4-2), GPfit(v.1.0-8), fansi(v.1.0.3), tzdb(v.0.3.0), parallelly(v.1.31.1), R6(v.2.5.1), lifecycle(v.1.0.1), rootSolve(v.1.8.2.3), zip(v.2.2.0), semTools(v.0.5-6), ggsignif(v.0.6.3), minqa(v.1.2.4), mi(v.1.0), jquerylib(v.0.1.4), qgraph(v.1.9.2), glasso(v.1.11), TH.data(v.1.1-1), RColorBrewer(v.1.1-3), iterators(v.1.0.14), gower(v.1.0.0), htmlwidgets(v.1.5.4), polyclip(v.1.10-0), GPArotation(v.2022.4-1), terra(v.1.5-21), rvest(v.1.0.2), globals(v.0.14.0), lmom(v.2.8), htmlTable(v.2.4.0), codetools(v.0.2-18), matrixStats(v.0.62.0), lubridate(v.1.8.0), randomForest(v.4.7-1), gtools(v.3.9.2), prettyunits(v.1.1.1), psych(v.2.2.3), dbplyr(v.2.1.1), R.methodsS3(v.1.8.1), GenomeInfoDb(v.1.30.1), gtable(v.0.3.0), DBI(v.1.1.2), visdat(v.0.5.3), smoothr(v.0.2.2), httr(v.1.4.3), highr(v.0.9), KernSmooth(v.2.23-20), vroom(v.1.5.7), stringi(v.1.7.6), progress(v.1.2.2), reshape2(v.1.4.4), farver(v.2.1.0), annotate(v.1.72.0), fdrtool(v.1.2.17), magick(v.2.7.3), timeDate(v.3043.102), lisrelToR(v.0.1.4), xml2(v.1.3.3), boot(v.1.3-28), kableExtra(v.1.3.4), rmeta(v.3.0), lme4(v.1.1-28), sem(v.3.1-14), kutils(v.1.70), bit(v.4.0.4), jpeg(v.0.1-9), pkgconfig(v.2.0.3), rstatix(v.0.7.0), bootstrap(v.2019.6) and knitr(v.1.39)

Stacking Models for Cognition in ABCD 3.0

Yue Wang and Narun P

16/May/2022

0.1 Note

0.2 Loading libraries

1 Data Preparation

1.1 Load data files

1.2 load all of the data after quality control

1.3 missing value for DTI

1.4 calculate partial correlation for resting state

1.5 Join all the data sets

1.5.1 filter out the participants that have problems with vision

1.6 set up variable names

1.7 functions for data preparation

1.8 extract desired contrasts

1.9 look at the distribution of the tasks

2 Modeling for genearlisation across times for each cognitive task

2.1 set up baseline and followup participants

2.2 Elastic net parameter tuning setup

2.2.1 set up the data and adjust for batch effects

2.2.2 a function for elastic net hyper parameter tuning and performance evaluation

2.3 model fitting for each cognitive task

2.3.1 actual data preping for elastic net for each cognitive task

2.3.2 actual parameter tuning for elastic net for each cognitive task

2.3.3 getting all the numbers of participants in all the data splits

2.3.4 Fit the tuned enet model to enetXplorer for each cognitive task and plot them

2.3.5 results of enetXplorer for each cognitive task

2.3.6 function to extract output from enetXplorer

2.3.7 Plot results from eNetExplorer with task-fMRI predicting each cognitive task

2.3.8 Plot results from eNetExplorer with sMRI predicting each cognitive task

2.3.9 Plot results from eNetExplorer with rs-fMRI predicting each cognitive task

2.3.10 Plot results from eNetExplorer with DTI predicting each cognitive task

2.4 Combine the predicted values with Random Forest Oppornistic Stacking

2.4.1 Random Forest Data preparation for each cognitive task

2.4.2 random forest tuning function

2.4.3 random forest model fit for each cognitive task

2.4.4 print the tuned parameters and model performace of the random forest model for each cognitve task

2.4.5 Plot non-conditional permutated variable importance of the random forest model

2.4.6 Plot conditional permutated variable importance of the random forest stacking model

2.4.6.1 Plot obverved against predicted value for the Random Forest Stacking model for each task

3 Relationship between MRI and G-factor: Confirmatory factor analysis

3.1 set up CFA

3.1.1 find the number of EFA factors using parallel analysis

3.1.2 fitting Oblimin FA

3.1.3 prep data for the EFA -> CFA model

3.1.4 build EFA -> CFA model using data_train

3.1.5 apply the EFA -> CFA model to data valid, data baseline, data followup

3.1.6 look at the final EFA CFA model

3.1.7 plot the EFA part

3.1.8 plot and provide sum stats of the 1st order model

3.1.9 plot and provide sum stats of the 2nd-order model

3.1.10 reliabity of the G-Factor

3.1.11 plot and provide sum stats of the single factor model

3.1.12 testing the stability of g-factor

3.1.13 correlation in 2nd-order g between those based on first layer train vs. whole baseline

3.1.14 Joining extracted factor scores with the data:

3.1.15 look at the number

3.1.16 Plot distribution of G factor

3.1.17 set CFA variable names for ploting

3.2 model fitting for Confirmatory factor analysis

3.2.1 data preping for elastic net for factor scores

3.2.2 actual parameter tuning for elastic net for factor scores

3.2.3 create table of the number of subjects in each data splits

3.2.4 look at the mean and sd of penalty

3.2.5 Fit the tuned enet model to enetXplorer for the factor scores

3.2.6 results of enetXplorer for each factor score

3.2.7 Plot results from eNetExplorer with task-fMRI predicting each factor score

3.2.8 Plot results from eNetExplorer with sMRI predicting each factor score

3.2.9 Plot results from eNetExplorer with DTI predicting each cognitive task

3.2.10 Plot brain images from eNetExplorer for tasks and sMRI predicting G-Factor

3.2.11 Plot brain images from eNetExplorer for DTI predicting G-Factor

3.3 Combine the predicted values with Random Forest Oppornistic Stacking for factor scores

3.3.1 Random Forest Data preparation for factor scores

3.3.2 random forest model fit for factor scores

3.3.3 print the tuned parameters and model performace of the random forest model for each factor score

3.3.4 Plot conditional permutated variable importance of the random forest stacking model for each factor score

3.3.5 Compute shapley values for the G-Factor

3.3.5.1 shapley computing function

3.3.5.2 extract model fit and training data from the previous codes

3.3.5.3 compute shapley values