## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 508773 27.2 1131714 60.5 643845 34.4
## Vcells 939865 7.2 8388608 64.0 1649066 12.6
Here we the ABCD release 3.0 data-set
The following libraries and default settings were used during the analysis:
options(scipen = 999)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.6 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(broom)
library(eNetXplorer)
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-4
library(olsrr)
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
library(pander)
library(tidymodels)
## -- Attaching packages -------------------------------------- tidymodels 0.2.0 --
## v dials 0.1.1 v rsample 0.1.1
## v infer 1.0.0 v tune 0.2.0
## v modeldata 0.1.1 v workflows 0.2.6
## v parsnip 0.2.1 v workflowsets 0.2.1
## v recipes 0.2.0 v yardstick 0.0.9
## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x scales::discard() masks purrr::discard()
## x Matrix::expand() masks tidyr::expand()
## x dplyr::filter() masks stats::filter()
## x recipes::fixed() masks stringr::fixed()
## x dplyr::lag() masks stats::lag()
## x Matrix::pack() masks tidyr::pack()
## x yardstick::spec() masks readr::spec()
## x recipes::step() masks stats::step()
## x Matrix::unpack() masks tidyr::unpack()
## x recipes::update() masks Matrix::update(), stats::update()
## * Search for functions across packages at https://www.tidymodels.org/find/
library("cowplot")
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:eNetXplorer':
##
## export
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
##parallel map
library("ggplot2")
library("furrr")
## Loading required package: future
library("party", quietly = TRUE)
##
## Attaching package: 'modeltools'
## The following object is masked from 'package:tune':
##
## parameters
## The following object is masked from 'package:parsnip':
##
## fit
## The following object is masked from 'package:infer':
##
## fit
## The following object is masked from 'package:dials':
##
## parameters
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'strucchange'
## The following object is masked from 'package:stringr':
##
## boundary
library("permimp")
theme_set(theme_bw() + theme(panel.grid = element_blank()))
## parallel processing number of cores register
all_cores <- parallel::detectCores(logical = FALSE) - 2
We first loaded all of the relevant data files (not shown here as they refer to local directories):
The aim is to compute the following:
structural mri: cortical thickness and subcortical value
resting state mri: Gordon network correlations
stop signal task (SST): any stop vs correct go contrast
MID (monetary incentive): reward vs neutral contrast
Nback task mri scan: 2 back vs 0 back contrast
DTI scan: FA from Hagler atlas
###loading site and scanner information
MRIinfo <-tibble::as_tibble(read.csv(paste0(datafolder, "ABCD_MRI01_DATA_TABLE.csv")))
Siteinfo <-tibble::as_tibble(read.csv(paste0(datafolder, "ABCD_LT01_DATA_TABLE.csv")))
MriandSite <- left_join(MRIinfo,Siteinfo, by=c('SUBJECTKEY','EVENTNAME'))
MRIinfo %>% count(EVENTNAME,SEX)
## # A tibble: 4 x 3
## EVENTNAME SEX n
## <chr> <chr> <int>
## 1 2_year_follow_up_y_arm_1 F 2617
## 2 2_year_follow_up_y_arm_1 M 3076
## 3 baseline_year_1_arm_1 F 5631
## 4 baseline_year_1_arm_1 M 6161
###loading response variables
NIH_TB <-tibble::as_tibble(read.csv(paste0(datafolder,"ABCD_TBSS01_DATA_TABLE.csv")))
LittleMan <-tibble::as_tibble(read.csv(paste0(datafolder,"LMTP201_DATA_TABLE.csv")))
Pearson <-tibble::as_tibble(read.csv(paste0(datafolder,"ABCD_PS01_DATA_TABLE.csv")))
NBackBeh <- tibble::as_tibble(read.csv(paste0(datafolder, "ABCD_MRINBACK02_DATA_TABLE.csv")) )
###loading brain scan data
smri_data <- tibble::as_tibble(read.csv( paste0(datafold,"sMRIDestAsegReadableGgsegQCedWeighted.csv")))
Nback_data <- tibble::as_tibble(read.csv(paste0(datafold, "NBackDestAsegReadableGgseg3dQCed.csv")))
MID_data <- tibble::as_tibble(read.csv(paste0(datafold, "MIDDestAsegReadableGgseg3dQCed.csv")))
SST_data <- tibble::as_tibble(read.csv(paste0(datafold, "SSTDestAsegReadableGgseg3d_QCed.csv")))
rsmri_data <- tibble::as_tibble(read.csv(paste0(datafold, "RSBetNetExcludedNoCOMBATABCD3.csv")))
DTI_data <- tibble::as_tibble(read.csv(paste0(datafold, "DTA_FA23Tracks.csv")))
### Note that we have not removed NA before Enet tuning to keep the most number of participants
short_names <- tibble::as_tibble(read.csv(paste0(scriptfold,"ShortNames_all.csv") ))
### load vision scan data, according to the documents some of the participants cannot see the IPad screen
vision_idx <- tibble::as_tibble(read.table(paste0(NDAfold, "abcd_svs01.txt"),header = TRUE))
site 01, 17, 19 have no DTI
MriandSite %>% select(SUBJECTKEY,EVENTNAME, SITE_ID_L) %>%
full_join(DTI_data, by = c('SUBJECTKEY','EVENTNAME')) %>%
select(FA_R_fornix, SITE_ID_L) %>%
group_by(SITE_ID_L) %>%
naniar::miss_var_summary() %>%
arrange(desc(pct_miss))
## # A tibble: 23 x 4
## # Groups: SITE_ID_L [23]
## SITE_ID_L variable n_miss pct_miss
## <chr> <chr> <int> <dbl>
## 1 site17 FA_R_fornix 890 100
## 2 site19 FA_R_fornix 840 100
## 3 site01 FA_R_fornix 562 99.8
## 4 site08 FA_R_fornix 113 22.6
## 5 site13 FA_R_fornix 233 21.7
## 6 site22 FA_R_fornix 7 20.6
## 7 site10 FA_R_fornix 203 17.7
## 8 site04 FA_R_fornix 209 17.4
## 9 site18 FA_R_fornix 85 15.3
## 10 site15 FA_R_fornix 100 15.2
## # ... with 13 more rows
We 1) kept the within-network z-transformed zero correlations 2) took the means of between-network z-transformed zero correlations 3) converted these Fisher z to r and rearranged them into a correlation matrix 4) transformed this matrix to “partial” correlations with cor2pcor 5) converted them back to Fisher z
library("corpcor")
ROI_names <- c("Auditory","CinguloOpercular","CinguloParietal","Default","DorsalAttention",
"FrontoParietal","None","RetrosplenialTemporal","SensorimotorHand","SensorimotorMouth",
"Salience","VentralAttention")
stepwise_sum <- function(i){
return(sum(diff_seq[1:i]))
}
diff_seq = seq(from=11, to=1, by=-1)
diff_seq2=seq(from=11,to=2)+1
dim_vector <- dim(select(rsmri_data, starts_with("avg")))[2]
dim_matrix <- (-(-1)+sqrt(1-4*1*2*(-dim_vector)))/2
step_seq <- map_dbl(.x=seq(from=1, to=11,by =1),.f=stepwise_sum)
upp_seq<- append(12,step_seq+12)
low_seq <- append(1,map_dbl(.x=seq(from=1,to=10, by=1),.f=function(.x){
return(sum(diff_seq2[1:.x])+1)
}))%>%append(.,78)
partialcor <- function(data_split){
cor_RS <- select(data_split, starts_with("avg"))
cor_RS<-DescTools::FisherZInv(cor_RS[,order(names(cor_RS))])
avg_cor_array <- array(rep(NA,dim_matrix^2*dim(cor_RS)[1]),
dim = c(dim_matrix,dim_matrix,dim(cor_RS)[1]))
Par_cor_array <- array(rep(NA,dim_matrix^2*dim(cor_RS)[1]),
dim = c(dim_matrix,dim_matrix,dim(cor_RS)[1]))
row_to_matrix <- function(j){
target=matrix(rep(1,dim_matrix^2),nrow=dim_matrix)
for(i in 1:length(low_seq)){
target[(i+1):13,i]=target[i,(i+1):13]=as.numeric(cor_RS[j,low_seq[i]:upp_seq[i]])
}
target[12,13]=target[13,12]=as.numeric(cor_RS[j,upp_seq[i]])
return(target)
}
avg_cor_array <-map(.x=seq(from=1,to=dim(cor_RS)[1],by=1),.f=row_to_matrix)
Par_cor_array <- avg_cor_array %>% map(.,cor2pcor)
matrix_to_row <- function(j){
target_vec<-rep(NA,dim(cor_RS)[2])
for(i in 1:length(upp_seq)){
target_vec[low_seq[i]:upp_seq[i]]<-as.vector(Par_cor_array[[j]][i,(i+1):13])
}
return(DescTools::FisherZ(target_vec))
}
Par_cor_RS_list <- map(.x=seq(from=1,to=dim(cor_RS)[1],by=1),
.f=matrix_to_row)
Par_cor_RS_df <- as.data.frame(do.call(rbind,Par_cor_RS_list))
colnames(Par_cor_RS_df)<-names(cor_RS)
Par_cor_RS_df <- Par_cor_RS_df%>%
rename_at(.vars = vars(contains("avg")),
.funs = funs(sub("avg", "par", .)))%>%
mutate(SUBJECTKEY =data_split$SUBJECTKEY,
EVENTNAME=data_split$EVENTNAME)
data_return <- left_join(Par_cor_RS_df,
data_split,
by = c("SUBJECTKEY","EVENTNAME"))%>%
drop_na()
return(data_return)
}
rsmri_par_data <- partialcor(data_split = rsmri_data)
## Registered S3 method overwritten by 'DescTools':
## method from
## reorder.factor gplots
vision_idx <- vision_idx[-1,]%>% mutate(SUBJECTKEY=subjectkey)%>%
mutate(EVENTNAME=eventname)
data_all <- plyr::join_all(list(smri_data,
MID_data,
rsmri_par_data,
Nback_data,
SST_data,DTI_data,
NBackBeh,
Pearson,
LittleMan,
NIH_TB,
MriandSite,
vision_idx),
by=c('SUBJECTKEY','EVENTNAME'), type='full')
### full join all the data sets to keep all the participants after QC. Actually smri have the most number of participants
### there are only two participants that are not in the smri data set but in other modalities.
### change integer response variables into double for later scaling and IQR
data_all$PEA_RAVLT_LD_TRIAL_VII_TC <- as.double(data_all$PEA_RAVLT_LD_TRIAL_VII_TC)
data_all$PEA_WISCV_TRS <- as.double(data_all$PEA_WISCV_TRS)
54 subjects
data_removed <- data_all %>% filter(snellen_va_y == 0 | snellen_va_y == 1 | vis_flg == 2)
removed_subj <- data_removed$SUBJECTKEY
data_all <- data_all %>% filter(!SUBJECTKEY %in% removed_subj)
length(removed_subj)
## [1] 54
mri_vec <- c("Nback","SST","MID","rsmri","smri","DTI")
Resp_Var <- c('TFMRI_NB_ALL_BEH_C2B_RATE',
"NIHTBX_PICVOCAB_UNCORRECTED",
"NIHTBX_FLANKER_UNCORRECTED",
"NIHTBX_PATTERN_UNCORRECTED",
"NIHTBX_PICTURE_UNCORRECTED",
"NIHTBX_READING_UNCORRECTED",
"LMT_SCR_PERC_CORRECT",
"PEA_RAVLT_LD_TRIAL_VII_TC",
"TFMRI_SST_ALL_BEH_TOTAL_ISSRT" )
resp_var_plotting_long <- c("2-back working memory",
"Picture vocabulary",
"Flanker",
"Pattern comparison processing speed",
"Picture sequence memory",
"Oral reading recognition",
"Little man task correct percentage",
"RAVLT long delay trial VII total correct",
"Stop Signal Reaction Time (integration)"
)
resp_var_plotting_short <- c("2-back Work Mem",
"Pic Vocab","Flanker",
"Pattern Speed",
"Seq Memory",
"Reading Recog",
"Little Man",
"Audi Verbal",
"integration SSRT")
resp_var_plotting <- tibble("response" =Resp_Var,
"longer_name"=resp_var_plotting_long,
"short_name"=resp_var_plotting_short)
subj_info <- c('SUBJECTKEY', 'MRI_INFO_DEVICESERIALNUMBER', 'SITE_ID_L')
resp_names <- data_all %>% select(all_of(Resp_Var))%>%
names()%>%
set_names()
new_shorter_names <- short_names
new_shorter_names$roiShort[97] <- "R Subcentral"
new_shorter_names$roiShort <- str_squish(string = new_shorter_names$roiShort)
IQR and combat (to adjust batch effects for sMRI, rsfMRI and DTI)
# IQR function to remove data with 3 IQR rule
IQR_remove <- function(data_split){
data_split%>%
mutate_at(vars(- all_of(subj_info)), ~ ifelse(
.x > quantile(.x, na.rm = TRUE)[4] + 3 * IQR(.x, na.rm = TRUE) |
.x < quantile(.x, na.rm = TRUE)[2] - 3 * IQR(.x, na.rm = TRUE),
NA, .x)) %>%
drop_na() %>%
mutate_if(is.double, ~ (.x - mean(.x)) / sd(.x))
}
library("sva")
## Loading required package: mgcv
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
##
## collapse
## This is mgcv 1.8-39. For overview type 'help("mgcv-package")'.
## Loading required package: genefilter
##
## Attaching package: 'genefilter'
## The following objects are masked from 'package:yardstick':
##
## sens, spec
## The following object is masked from 'package:readr':
##
## spec
## Loading required package: BiocParallel
batch_adjust <- function(data_fold,resp, x,y){
ols_data <- select(data_fold, resp,starts_with(x), starts_with(y))
ols_matrix <- as.matrix(t(ols_data))
dimnames(ols_matrix)<- NULL
data_fold$SITE_ID_L <- as.factor(data_fold$SITE_ID_L)##have to be changed into factor before combat
data_fold$SITE_ID_L <- droplevels(data_fold$SITE_ID_L)##drop the empty levels of a factor
ols_matrix_com <- ComBat(dat = ols_matrix,batch = data_fold$SITE_ID_L)
ols_data_com <- data.frame(t(ols_matrix_com))
colnames(ols_data_com) <- colnames(ols_data)
return(ols_data_com)
}
# if only one type of data is adjusted (e.g., DTI only requires FA but sMRI has two corticol and sub, rs has two within and bet networks)
batch_adjustOne <- function(data_fold,resp, x){
ols_data <- select(data_fold, resp,starts_with(x))
ols_matrix <- as.matrix(t(ols_data))
dimnames(ols_matrix)<- NULL
data_fold$SITE_ID_L <- as.factor(data_fold$SITE_ID_L)##have to be changed into factor before combat
data_fold$SITE_ID_L <- droplevels(data_fold$SITE_ID_L)##drop the empty levels of a factor
ols_matrix_com <- ComBat(dat = ols_matrix,batch = data_fold$SITE_ID_L)
ols_data_com <- data.frame(t(ols_matrix_com))
colnames(ols_data_com) <- colnames(ols_data)
return(ols_data_com)
}
## look at the contrast of MID and SST
contrast_vec_MID <- MID_data %>%
select(.,ends_with("_ROI_Left.Cerebellum.Cortex")) %>%
colnames() %>%
str_remove_all(.,"_ROI_Left.Cerebellum.Cortex")
contrast_vec_SST <- SST_data %>%
select(.,ends_with("_ROI_Left.Cerebellum.Cortex")) %>%
colnames() %>%
str_remove_all(.,"_ROI_Left.Cerebellum.Cortex")
## extract the contrasts for analysis
data_analysis <- data_all %>%
select(starts_with("X2backvs0back_ROI_"),
starts_with("par"),
starts_with("Within"),
all_of(Resp_Var),
all_of(subj_info),
starts_with("antiRewVsNeu" ),
starts_with("anystopvscorrectgo" ),
starts_with("ASEG"),
starts_with("Dest_Thick"))
note 2nd time point doesn’t have “NIHTBX_LIST_UNCORRECTED” “NIHTBX_CARDSORT_UNCORRECTED” “PEA_WISCV_TRS”
plotRes <- Resp_Var %>% map(.,~ggplot(data = data_all,
mapping = aes_string(x = data_all[[.]], color = "EVENTNAME")) +
geom_density() +
labs(x= resp_var_plotting$short_name[which(resp_var_plotting$response==.)]) +
labs(y= "") +
scale_color_discrete(name ="",
breaks=c("2_year_follow_up_y_arm_1", "baseline_year_1_arm_1"),
labels=c("Follow Up", "Baseline")) +
theme_classic())
ggpubr::ggarrange(plotlist=plotRes, common.legend = TRUE)
## Warning: Removed 3040 rows containing non-finite values (stat_density).
## Removed 3040 rows containing non-finite values (stat_density).
## Warning: Removed 428 rows containing non-finite values (stat_density).
## Warning: Removed 320 rows containing non-finite values (stat_density).
## Warning: Removed 371 rows containing non-finite values (stat_density).
## Warning: Removed 343 rows containing non-finite values (stat_density).
## Warning: Removed 455 rows containing non-finite values (stat_density).
## Warning: Removed 410 rows containing non-finite values (stat_density).
## Warning: Removed 474 rows containing non-finite values (stat_density).
## Warning: Removed 6659 rows containing non-finite values (stat_density).
First we removed sites that have fewer than 100 participants. But none has fewer than 100.
The data structure of the analysis is comprised of four parts:
1) training set (1st layer training set): predicting cognition from each
modality via the elastic net,
validation set (2nd layer training set): combining predicted values from all modalities via the random forest,
test set of the baseline period,
test set of the second year follow up.
To protect data leakage, we select the subjects with the same subject key for the baseline test set and the followup test set.
site_count <- data_analysis %>% count(SITE_ID_L)
baseline_data <- data_all %>% filter(EVENTNAME == "baseline_year_1_arm_1")
followup_data <- data_all %>% filter(EVENTNAME == "2_year_follow_up_y_arm_1")
## all sites have more than 100!
baseline_analysis <- baseline_data %>% filter (SITE_ID_L != site_count$SITE_ID_L[which(site_count$n< 100)])
followup_analysis <- followup_data %>% filter (SITE_ID_L != site_count$SITE_ID_L[which(site_count$n< 100)])
subj_diff <- setdiff(baseline_analysis$SUBJECTKEY,followup_analysis$SUBJECTKEY)
baseline_trainvalid <- baseline_analysis %>% subset(.,SUBJECTKEY %in% subj_diff)
baseline_test <- baseline_analysis %>% subset(.,SUBJECTKEY %in% followup_analysis$SUBJECTKEY)
set.seed(123456)
### divide the training dataset into two
### one for enet tuning and one for random forest tuning
baseline_two_fold <-vfold_cv(data =baseline_trainvalid, v=2,repeats = 1)
# training set (1st layer training set): predicting cognition from each modality via the elastic net,
dim(analysis(baseline_two_fold$splits[[1]]))[1]
## [1] 3041
# validation set (2nd layer training set): combining predicted values from all modalities via the random forest,
dim(assessment(baseline_two_fold$splits[[1]]))[1]
## [1] 3042
# test set of the baseline period,
dim(baseline_test)[1]
## [1] 5622
# test set of the second year follow up
dim(followup_analysis)[1]
## [1] 5656
Here we used Combat, an empirical Bayesian method, to adjust for batch effects (separately for the train and test). Because Nback, SST and MID task-based fMRI are contrast data, batch effects are negligible. And accordingly, we decided not to apply combat on the task-based fMRI, and only applied it on the resting state mri, structure mri and DTI. https://www.biorxiv.org/content/10.1101/309260v1.full.pdf
# for models that require no Combat
enet_processing_nocom <-
function(resp_var,
split_train = analysis(baseline_two_fold$splits[[1]]),
split_valid = assessment(baseline_two_fold$splits[[1]]),
split_test = baseline_test,followup_test,contrast_name)
{
data_train <- split_train %>%
select(all_of(resp_var),
all_of(subj_info),
starts_with(contrast_name)) %>%
drop_na() %>%
IQR_remove()
data_valid <- split_valid %>%
select(all_of(resp_var),
all_of(subj_info),
starts_with(contrast_name)) %>%
drop_na() %>%
IQR_remove()
baseline_test_set <- split_test %>%
select(all_of(resp_var),
all_of(subj_info),
starts_with(contrast_name)) %>%
drop_na() %>%
IQR_remove()
followup_test_set <- followup_test %>%
select(all_of(resp_var),
all_of(subj_info),
starts_with(contrast_name)) %>%
drop_na() %>%
IQR_remove()
ols_data_train <- select(data_train, resp_var, starts_with(contrast_name))
ols_data_valid <- select(data_valid,resp_var,starts_with(contrast_name))
ols_test_baseline <- select(baseline_test_set,resp_var,starts_with(contrast_name))
ols_test_followup <- select(followup_test_set,resp_var,starts_with(contrast_name))
return(list(ols_data_train=ols_data_train,
ols_data_valid=ols_data_valid,
ols_test_baseline=ols_test_baseline,
ols_test_followup=ols_test_followup,
data_valid_id=tibble(SUBJECTKEY=data_valid$SUBJECTKEY),
baseline_test_id=tibble(SUBJECTKEY=baseline_test_set$SUBJECTKEY),
followup_test_id=tibble(SUBJECTKEY=followup_test_set$SUBJECTKEY),
data_train_id=tibble(SUBJECTKEY=data_train$SUBJECTKEY)))
}
# for models that require Combat
enet_processing_com <-
function(resp_var,
split_train = analysis( baseline_two_fold$splits[[1]]),
split_valid = assessment(baseline_two_fold$splits[[1]]),
split_test=baseline_test,followup_test, measure_one,measure_two)
{
data_train <- split_train %>%
select(all_of(resp_var),
all_of(subj_info),
starts_with(measure_one),
starts_with(measure_two)) %>%
drop_na() %>%
IQR_remove()
data_valid <- split_valid %>%
select(all_of(resp_var),
all_of(subj_info),
starts_with(measure_one),
starts_with(measure_two)) %>%
drop_na()%>%
IQR_remove()
baseline_test_set <- split_test%>%
select(all_of(resp_var),
all_of(subj_info),
starts_with(measure_one),
starts_with(measure_two))%>%
drop_na()%>%
IQR_remove()
followup_test_set <- followup_test%>%
select(all_of(resp_var),
all_of(subj_info),
starts_with(measure_one),
starts_with(measure_two))%>%
drop_na()%>%
IQR_remove()
ols_data_train <-
batch_adjust(data_fold = data_train,
resp = resp_var,
x=measure_one,
y=measure_two)
ols_data_valid <-
batch_adjust(data_fold = data_valid,
resp = resp_var,
x=measure_one,
y=measure_two)
ols_test_baseline <-
batch_adjust(data_fold = baseline_test_set,
resp = resp_var,
x=measure_one,
y=measure_two)
ols_test_followup <-
batch_adjust(data_fold = followup_test_set,
resp = resp_var,
x=measure_one,
y=measure_two)
return(list(ols_data_train=ols_data_train,
ols_data_valid=ols_data_valid,
ols_test_baseline=ols_test_baseline,
ols_test_followup=ols_test_followup,
data_valid_id=tibble(SUBJECTKEY=data_valid$SUBJECTKEY),
baseline_test_id=tibble(SUBJECTKEY=baseline_test_set$SUBJECTKEY),
followup_test_id=tibble(SUBJECTKEY=followup_test_set$SUBJECTKEY),
data_train_id=tibble(SUBJECTKEY=data_train$SUBJECTKEY)))
}
# for models that require Combat that has one type of data. so far this was used for DTI only. This is bc DTI only one prefix ("fa")
enet_processing_comOne <-
function(resp_var,
split_train = analysis( baseline_two_fold$splits[[1]]),
split_valid = assessment(baseline_two_fold$splits[[1]]),
split_test=baseline_test,followup_test, measure_one)
{
data_train <- split_train %>%
select(all_of(resp_var),
all_of(subj_info),
starts_with(measure_one)) %>%
drop_na() %>%
IQR_remove()
data_valid <- split_valid %>%
select(all_of(resp_var),
all_of(subj_info),
starts_with(measure_one)) %>%
drop_na()%>%
IQR_remove()
baseline_test_set <- split_test%>%
select(all_of(resp_var),
all_of(subj_info),
starts_with(measure_one))%>%
drop_na()%>%
IQR_remove()
followup_test_set <- followup_test%>%
select(all_of(resp_var),
all_of(subj_info),
starts_with(measure_one))%>%
drop_na()%>%
IQR_remove()
ols_data_train <-
batch_adjustOne(data_fold = data_train,
resp = resp_var,
x=measure_one)
ols_data_valid <-
batch_adjustOne(data_fold = data_valid,
resp = resp_var,
x=measure_one)
ols_test_baseline <-
batch_adjustOne(data_fold = baseline_test_set,
resp = resp_var,
x=measure_one)
ols_test_followup <-
batch_adjustOne(data_fold = followup_test_set,
resp = resp_var,
x=measure_one)
return(list(ols_data_train=ols_data_train,
ols_data_valid=ols_data_valid,
ols_test_baseline=ols_test_baseline,
ols_test_followup=ols_test_followup,
data_valid_id=tibble(SUBJECTKEY=data_valid$SUBJECTKEY),
baseline_test_id=tibble(SUBJECTKEY=baseline_test_set$SUBJECTKEY),
followup_test_id=tibble(SUBJECTKEY=followup_test_set$SUBJECTKEY),
data_train_id=tibble(SUBJECTKEY=data_train$SUBJECTKEY)))
}
this function will fitting enet models and to apply them the 2nd layer model and to the test sets
enet_tuning <- function(resp_var, data_list){
#recipe train and test data are from ols as the variables are the same
norm_recipe <-recipe(as.formula(paste0(resp_var, "~ .")),
data =data_list[["ols_data_train"]] ) %>%
step_dummy(all_nominal()) %>%
step_center(all_predictors()) %>%
step_scale(all_predictors()) %>%
# estimate the means and standard deviations
prep(training = data_list[["ols_data_train"]], retain = TRUE)
## model
glmn_mod <- linear_reg(penalty = tune() ,mixture =tune()) %>%
set_engine("glmnet")
set.seed(123456)
# 10fold split within the training data
all_rs <- rsample::vfold_cv(data=data_list[["ols_data_train"]],
v = 10, repeats = 1)
glmn_wfl <- workflow() %>%
add_recipe(norm_recipe) %>%
add_model(glmn_mod)
## penalty is lambda, mixture is alpha
## log10 transformed to get lambda between 10^-10 to 10^1
## this expands the default values (10^-10 to 10^0) from tidymodels https://github.com/tidymodels/dials/issues/140
glmn_set <- dials::parameters(penalty(range = c(-10,1), trans = log10_trans()),mixture())
## 200 levels of lambda and 11 levels of alpha (0, .1 to 1)
glmn_grid <- grid_regular(glmn_set, levels = c(200,11))
ctrl <- control_grid(save_pred = TRUE, verbose = TRUE)
glmn_tune <- tune_grid(glmn_wfl,
resamples = all_rs,
grid = glmn_grid,
metrics = metric_set(mae),
control = ctrl)
##select the best tuned lambda and alpha
best_glmn <- select_best(glmn_tune, metric = "mae")
## use the best tunes parameters to fit the test data
glmn_wfl_final <-
glmn_wfl %>%
finalize_workflow(best_glmn) %>%
parsnip::fit(data = data_list[["ols_data_train"]]) ##clearly another package is blocking this function
### this is the prediction for random forest tuning
pred_glmn <- predict(glmn_wfl_final, new_data =data_list[["ols_data_valid"]])
###this is the prediction for test set
pred_test_baseline <- predict(glmn_wfl_final, new_data =data_list[["ols_test_baseline"]])
pred_test_followup <- predict(glmn_wfl_final, new_data =data_list[["ols_test_followup"]])
##compute the correlation between predicted results and the test data
performance_glmn <- cor(pred_glmn$.pred,data_list[["ols_data_valid"]][[resp_var]])
performance_baseline <- cor(pred_test_baseline$.pred,data_list[["ols_test_baseline"]][[resp_var]])
performance_followup <- cor(pred_test_followup$.pred,data_list[["ols_test_followup"]][[resp_var]])
best_glmn <- best_glmn %>%
mutate(performance=performance_glmn,
baseline_performance = performance_baseline,
followup_performance=performance_followup)
return(list("param_tune"=best_glmn,
"predicted"=tibble(SUBJECTKEY=data_list[["data_valid_id"]]$SUBJECTKEY,
"pred"=pred_glmn$.pred),
"predicted_baseline"=tibble(SUBJECTKEY=data_list[["baseline_test_id"]]$SUBJECTKEY,
"pred"=pred_test_baseline$.pred),
"predicted_followup"=tibble(SUBJECTKEY=data_list[["followup_test_id"]]$SUBJECTKEY,
"pred"=pred_test_followup$.pred),
"obs_train"=length(data_list[["data_train_id"]]$SUBJECTKEY),
"obs_valid"=length(data_list[["data_valid_id"]]$SUBJECTKEY)))
}
Elastic net tuning is saved together with random forest results.
Nback_enet_data <- resp_names %>%
map(.,~enet_processing_nocom(resp_var = .,contrast_name = "X2backvs0back_ROI_",followup_test = followup_analysis))
##change this code before redo the random forest
##the codes are changed but not run except for the first one
SST_enet_data <- resp_names %>%
map(.,~enet_processing_nocom(resp_var = .,
followup_test = followup_analysis,
contrast_name = "anystopvscorrectgo_ROI_"))
MID_enet_data <- resp_names %>%
map(.,~enet_processing_nocom(resp_var = .,
followup_test = followup_analysis,
contrast_name = "antiRewVsNeu_ROI_"))
DTI_enet_data <- resp_names %>%
map(.,~enet_processing_comOne(resp_var = .,
followup_test = followup_analysis,
measure_one = "FA_"))
smri_enet_data <- resp_names %>%
map(.,~enet_processing_com(resp_var = .,
followup_test = followup_analysis,
measure_one = "ASEG_",
measure_two = "Dest_Thick_"))
rsmri_enet_data <- resp_names %>%
map(.,~enet_processing_com(resp_var = .,
followup_test = followup_analysis,
measure_one = "par",
measure_two = "Within"))
enet_data_all <- list("Nback"=Nback_enet_data,
"SST"=SST_enet_data,
"MID"=MID_enet_data,
"rsmri"=rsmri_enet_data,
"smri"=smri_enet_data,
"DTI"=DTI_enet_data)
##keep from using all the cores
##keep your computer safe
no_cores <- availableCores() - 4
plan(multisession,workers= no_cores)
Nback_results <- resp_names %>%
future_map(.,~enet_tuning(resp_var =.,
data_list = Nback_enet_data[[.]]),
.options = furrr_options(seed=TRUE))
##change this code before redo the random forest
##the codes are changed but not run except for the first one
SST_results <- resp_names %>%
future_map(.,~enet_tuning(resp_var =.,
data_list = SST_enet_data[[.]]),
.options = furrr_options(seed=TRUE))
MID_results <- resp_names %>%
future_map(.,~enet_tuning(resp_var =.,
data_list = MID_enet_data[[.]]),
.options = furrr_options(seed=TRUE))
DTI_results <- resp_names %>%
future_map(.,~enet_tuning(resp_var =.
,data_list = DTI_enet_data[[.]]),
.options = furrr_options(seed=TRUE))
smri_results <- resp_names %>%
future_map(.,~enet_tuning(resp_var =.
,data_list = smri_enet_data[[.]]),
.options = furrr_options(seed=TRUE))
rsmri_results <- resp_names %>%
future_map(.,~enet_tuning(resp_var =.
,data_list = rsmri_enet_data[[.]]),
.options = furrr_options(seed=TRUE))
enet_results_all <- list("Nback_results"=Nback_results,
"SST_results"=SST_results,
"MID_results"=MID_results,
"rsmri_results"=rsmri_results,
"smri_results"=smri_results,
"DTI_results"=DTI_results)
modality_vec <- c("Nback","SST","MID","rsmri","smri","DTI")
modality_results <- paste0(modality_vec,"_results")
datasplits_number = function(data_list, resp_vec, resp_plotting){
number_datasplits <- modality_results %>% map(.,function(modality_resp=.){
data_input <- data_list[[modality_resp]]
one_modality <- resp_vec %>% map(.,function(resp=.){
one_allset <- tibble(enet_train = data_input[[resp]][["obs_train"]],
stack_train=data_input[[resp]][["obs_valid"]],
baseline=dim(data_input[[resp]][["predicted_baseline"]])[1],
followup=dim(data_input[[resp]][["predicted_followup"]])[1],
response=resp)
return(one_allset)
})
})
names(number_datasplits)<- modality_results
number_datasplits <- number_datasplits %>% map(.,~do.call(rbind,.))
number_datasplits <- number_datasplits %>% map(.,~left_join(.,resp_plotting,by="response"))
modality_results %>% map(.,~ mutate(number_datasplits[[.]],
modality=str_remove_all(.,"_results"))%>%
select(modality,
enet_train,
stack_train,
baseline,
followup,
short_name)%>%
kableExtra::kbl(caption = paste0("Sample size of all the data sets ")) %>%
kableExtra::kable_classic(full_width = F, html_font = "Cambria") # seems to have problem knitting
)
}
datasplits_number(data_list = enet_results_all,resp_vec = resp_names,resp_plotting = resp_var_plotting)
[[1]]
modality | enet_train | stack_train | baseline | followup | short_name |
---|---|---|---|---|---|
Nback | 1417 | 1385 | 2734 | 3183 | 2-back Work Mem |
Nback | 1397 | 1364 | 2719 | 3078 | Pic Vocab |
Nback | 1398 | 1363 | 2709 | 3131 | Flanker |
Nback | 1397 | 1361 | 2708 | 3118 | Pattern Speed |
Nback | 1397 | 1364 | 2717 | 3129 | Seq Memory |
Nback | 1392 | 1364 | 2712 | 3073 | Reading Recog |
Nback | 1395 | 1361 | 2642 | 3175 | Little Man |
Nback | 1392 | 1351 | 2682 | 3123 | Audi Verbal |
Nback | 1162 | 1148 | 2285 | 2738 | integration SSRT |
modality | enet_train | stack_train | baseline | followup | short_name |
---|---|---|---|---|---|
SST | 1403 | 1428 | 2755 | 3001 | 2-back Work Mem |
SST | 1447 | 1458 | 2854 | 2987 | Pic Vocab |
SST | 1447 | 1452 | 2848 | 3039 | Flanker |
SST | 1445 | 1452 | 2848 | 3028 | Pattern Speed |
SST | 1447 | 1456 | 2850 | 3040 | Seq Memory |
SST | 1439 | 1458 | 2846 | 2983 | Reading Recog |
SST | 1440 | 1452 | 2772 | 3085 | Little Man |
SST | 1444 | 1446 | 2819 | 3022 | Audi Verbal |
SST | 1469 | 1480 | 2872 | 3088 | integration SSRT |
modality | enet_train | stack_train | baseline | followup | short_name |
---|---|---|---|---|---|
MID | 1599 | 1594 | 3066 | 3221 | 2-back Work Mem |
MID | 1647 | 1635 | 3213 | 3222 | Pic Vocab |
MID | 1646 | 1628 | 3210 | 3281 | Flanker |
MID | 1647 | 1630 | 3206 | 3270 | Pattern Speed |
MID | 1651 | 1635 | 3211 | 3279 | Seq Memory |
MID | 1643 | 1634 | 3205 | 3215 | Reading Recog |
MID | 1641 | 1631 | 3132 | 3326 | Little Man |
MID | 1639 | 1621 | 3175 | 3261 | Audi Verbal |
MID | 1295 | 1323 | 2579 | 2771 | integration SSRT |
modality | enet_train | stack_train | baseline | followup | short_name |
---|---|---|---|---|---|
rsmri | 1871 | 1900 | 3556 | 4051 | 2-back Work Mem |
rsmri | 2041 | 2056 | 3862 | 4270 | Pic Vocab |
rsmri | 2037 | 2045 | 3858 | 4350 | Flanker |
rsmri | 2039 | 2052 | 3853 | 4331 | Pattern Speed |
rsmri | 2043 | 2054 | 3860 | 4347 | Seq Memory |
rsmri | 2041 | 2054 | 3846 | 4258 | Reading Recog |
rsmri | 2034 | 2054 | 3756 | 4417 | Little Man |
rsmri | 2034 | 2041 | 3823 | 4327 | Audi Verbal |
rsmri | 1503 | 1535 | 2970 | 3438 | integration SSRT |
modality | enet_train | stack_train | baseline | followup | short_name |
---|---|---|---|---|---|
smri | 2133 | 2178 | 4028 | 4380 | 2-back Work Mem |
smri | 2467 | 2484 | 4622 | 4724 | Pic Vocab |
smri | 2465 | 2484 | 4619 | 4800 | Flanker |
smri | 2464 | 2478 | 4611 | 4785 | Pattern Speed |
smri | 2467 | 2485 | 4619 | 4805 | Seq Memory |
smri | 2455 | 2482 | 4608 | 4712 | Reading Recog |
smri | 2464 | 2488 | 4498 | 4881 | Little Man |
smri | 2450 | 2472 | 4571 | 4775 | Audi Verbal |
smri | 1661 | 1706 | 3292 | 3624 | integration SSRT |
modality | enet_train | stack_train | baseline | followup | short_name |
---|---|---|---|---|---|
DTI | 2104 | 2111 | 3742 | 4117 | 2-back Work Mem |
DTI | 2324 | 2312 | 4159 | 4369 | Pic Vocab |
DTI | 2321 | 2313 | 4154 | 4439 | Flanker |
DTI | 2322 | 2308 | 4151 | 4428 | Pattern Speed |
DTI | 2326 | 2312 | 4155 | 4439 | Seq Memory |
DTI | 2324 | 2312 | 4143 | 4357 | Reading Recog |
DTI | 2317 | 2311 | 4054 | 4510 | Little Man |
DTI | 2320 | 2301 | 4126 | 4412 | Audi Verbal |
DTI | 1637 | 1650 | 3051 | 3424 | integration SSRT |
enetXplorer allows us to use permutation to test the significance of each feature
### NOT RUN by default, takes long time
enet_results_xplorer <- enet_results_all
names(enet_results_xplorer) <- modality_vec
fitting_enetxplorer <- function(resp_vec , modality_var,data_input,tuning_input){
data_split=data_input[[modality_var]]
enet_train <- resp_vec %>% map(., ~select(data_split[[.]][["ols_data_train"]],-all_of(.)))
enet_test <- resp_vec %>% map(., ~select(data_split[[.]][["ols_test_baseline"]],-all_of(.)))
enet_resp_train <- resp_vec %>% map(., ~select(data_split[[.]][["ols_data_train"]],all_of(.)))
enet_resp_test <- resp_vec %>% map(., ~select(data_split[[.]][["ols_test_baseline"]],all_of(.)))
doParallel::registerDoParallel(cores = 20)
fit_enet_all_IQR <-resp_vec %>%
future_map(.,~eNetXplorer(x = as.matrix(enet_train[[.]]) ,
y = as.vector(enet_resp_train[[.]][[.]]),
alpha =tuning_input[[modality_var]][[.]][["param_tune"]][["mixture"]],
n_fold = 10,
#nlambda.ext = 10,
#nlambda = 10, #uses the default 100 levels with the data-driven range defined by glmnet
scaled = TRUE,
seed = 123456),
.options = furrr_options(seed=TRUE))
return(fit_enet_all_IQR)
}
##keep from using all the cores
##keep your computer safe
no_cores <- availableCores() - 4
plan(multisession,workers= no_cores)
enet_tuning_fit <- modality_vec %>%
map(.,~fitting_enetxplorer(resp_vec = resp_names,
modality_var = .,
data_input = enet_data_all,
tuning_input = enet_results_xplorer))
names(enet_tuning_fit)<- modality_vec
here we show hyperparameters from enetXplorer.
Alpha is forced to be the same with the tidymodels approach. Lambda via
glmnet adaptive range is close to the tidymodels approach. see more on
this approach here
https://stats.stackexchange.com/questions/174897/choosing-the-range-and-grid-density-for-regularization-parameter-in-lasso
The accuracy is out-of-bag correlations between predicted and observed
within the training set. P value is based on the permutation test.
enet_fit_table <- function(modality_input,fit_input, tune_input,resp_input, plotting_names ){
fit_enet_all_IQR <- fit_input[[modality_input]]
glmn_tuning_all_IQR <- tune_input[[modality_input]]
lambdas_all <- vector("list", length = length(resp_input))
names(lambdas_all)<- resp_input
lambdas_all_best <- vector("list", length = length(resp_input))
names(lambdas_all_best)<- resp_input
summary_enet_all <- vector("list", length = length(resp_input))
names(summary_enet_all) <- resp_input
for(i in 1:length(resp_input)){
lambdas_all[[resp_input[i]]] <- fit_enet_all_IQR[[resp_input[i]]][["lambda_values"]]
lambdas_all_best[[resp_input[i]]] <- fit_enet_all_IQR[[resp_input[i]]][["best_lambda"]]
summary_enet_all[[resp_input[i]]]<- as_tibble(summary(fit_enet_all_IQR[[resp_input[i]]])[[2]]) %>% slice(1)
}
summary_enet_all_test <- summary_enet_all %>% bind_rows() %>%
mutate(respones = plotting_names$short_name)
summary_vec <- colnames(summary_enet_all_test)[-5]
summary_enet_all_test <- summary_vec %>% map(.,function(var_input=.){
summary_enet_all_test[[paste0(var_input)]] <- round(summary_enet_all_test[[paste0(var_input)]],4)
})
names(summary_enet_all_test)<- summary_vec
summary_enet_all_test <- summary_enet_all_test %>% bind_rows()%>%
mutate(respones = plotting_names$short_name)%>%
rename(.,Alpha = alpha, `Best-tune lambda` = lambda.max,
`Accuracy (Pearson r)` = QF.est,
`P-value` = model.vs.null.pval) %>%
#knitr::kable(caption = paste0("EnetXplorer fitting of ", modality_input))
kableExtra::kbl(caption = paste0("EnetXplorer fitting of ", modality_input)) %>%
kableExtra::kable_classic(full_width = F, html_font = "Cambria")
# seems to have problem knitting
##problem solved by adding results='asis' option in knitting
}
modality_vec %>% map(.,~enet_fit_table(modality_input=.,
fit_input = enet_tuning_fit,
tune_input = enet_results_xplorer,
resp_input = resp_names,
plotting_names = resp_var_plotting))
[[1]]
Alpha | Best-tune lambda | Accuracy (Pearson r) | P-value | respones |
---|---|---|---|---|
0.0 | 0.7398 | 0.4706 | 0.0004 | 2-back Work Mem |
0.0 | 0.8318 | 0.3464 | 0.0004 | Pic Vocab |
0.0 | 1.5030 | 0.1769 | 0.0004 | Flanker |
0.3 | 0.1277 | 0.1183 | 0.0004 | Pattern Speed |
0.0 | 4.0530 | 0.1895 | 0.0004 | Seq Memory |
0.0 | 2.4728 | 0.2959 | 0.0004 | Reading Recog |
0.1 | 0.1947 | 0.2426 | 0.0004 | Little Man |
0.1 | 0.2181 | 0.1645 | 0.0004 | Audi Verbal |
0.0 | 0.8539 | 0.1153 | 0.0012 | integration SSRT |
Alpha | Best-tune lambda | Accuracy (Pearson r) | P-value | respones |
---|---|---|---|---|
0.5 | 0.0254 | 0.1282 | 0.0008 | 2-back Work Mem |
0.2 | 0.0484 | 0.1658 | 0.0004 | Pic Vocab |
0.5 | 0.0476 | 0.1196 | 0.0012 | Flanker |
0.0 | 0.0593 | 0.0581 | 0.0496 | Pattern Speed |
0.1 | 0.0018 | 0.0052 | 0.4390 | Seq Memory |
0.0 | 0.0062 | 0.0741 | 0.0216 | Reading Recog |
0.8 | 0.0314 | 0.0936 | 0.0064 | Little Man |
0.9 | 0.0518 | 0.0549 | 0.0396 | Audi Verbal |
0.0 | 0.5711 | 0.3137 | 0.0004 | integration SSRT |
Alpha | Best-tune lambda | Accuracy (Pearson r) | P-value | respones |
---|---|---|---|---|
0.0 | 0.6653 | 0.1379 | 0.0004 | 2-back Work Mem |
0.0 | 0.8611 | 0.2202 | 0.0004 | Pic Vocab |
0.0 | 2.6097 | 0.0704 | 0.0112 | Flanker |
0.0 | 2.0368 | 0.0783 | 0.0028 | Pattern Speed |
0.0 | 2.8319 | 0.0815 | 0.0064 | Seq Memory |
0.1 | 0.1873 | 0.2257 | 0.0004 | Reading Recog |
0.1 | 0.1866 | 0.0916 | 0.0040 | Little Man |
0.0 | 1.7944 | 0.1472 | 0.0004 | Audi Verbal |
1.0 | 0.0233 | 0.0760 | 0.0224 | integration SSRT |
Alpha | Best-tune lambda | Accuracy (Pearson r) | P-value | respones |
---|---|---|---|---|
0 | 0.0512 | 0.1677 | 0.0004 | 2-back Work Mem |
0 | 0.5142 | 0.2210 | 0.0004 | Pic Vocab |
0 | 3.0027 | 0.0962 | 0.0004 | Flanker |
0 | 0.3233 | 0.0822 | 0.0032 | Pattern Speed |
0 | 1.1274 | 0.0997 | 0.0004 | Seq Memory |
0 | 0.0907 | 0.1959 | 0.0004 | Reading Recog |
1 | 0.0304 | 0.1415 | 0.0004 | Little Man |
0 | 3.9336 | 0.0995 | 0.0008 | Audi Verbal |
1 | 0.0351 | 0.0645 | 0.0328 | integration SSRT |
Alpha | Best-tune lambda | Accuracy (Pearson r) | P-value | respones |
---|---|---|---|---|
0.0 | 4.8930 | 0.1455 | 0.0004 | 2-back Work Mem |
0.2 | 0.0418 | 0.2112 | 0.0004 | Pic Vocab |
1.0 | 0.0328 | 0.0754 | 0.0060 | Flanker |
0.0 | 1.8544 | 0.0612 | 0.0100 | Pattern Speed |
0.0 | 2.7896 | 0.1288 | 0.0004 | Seq Memory |
0.0 | 0.9688 | 0.1869 | 0.0004 | Reading Recog |
0.0 | 2.9783 | 0.1447 | 0.0004 | Little Man |
0.1 | 0.2536 | 0.0920 | 0.0004 | Audi Verbal |
0.0 | 4.6013 | 0.0647 | 0.0144 | integration SSRT |
Alpha | Best-tune lambda | Accuracy (Pearson r) | P-value | respones |
---|---|---|---|---|
0.0 | 0.7204 | 0.1089 | 0.0004 | 2-back Work Mem |
0.0 | 0.0129 | 0.2142 | 0.0004 | Pic Vocab |
0.2 | 0.0295 | 0.1105 | 0.0004 | Flanker |
0.0 | 0.3508 | 0.1073 | 0.0004 | Pattern Speed |
0.0 | 0.4257 | 0.1005 | 0.0004 | Seq Memory |
1.0 | 0.0019 | 0.1863 | 0.0004 | Reading Recog |
0.1 | 0.0544 | 0.1337 | 0.0004 | Little Man |
0.0 | 0.0939 | 0.0991 | 0.0004 | Audi Verbal |
0.0 | 1.5825 | -0.0135 | 0.4242 | integration SSRT |
extract_tibble <- function(elastic_mod, alpha_index) {
variable <- elastic_mod$feature_coef_wmean[, alpha_index] %>% names()
wmean <- elastic_mod$feature_coef_wmean[, alpha_index]
wsd <- elastic_mod$feature_coef_wsd[, alpha_index]
null_wmean <- elastic_mod$null_feature_coef_wmean[, alpha_index]
null_wsd <- elastic_mod$null_feature_coef_wsd[, alpha_index]
pvalue <- elastic_mod$feature_coef_model_vs_null_pval[, alpha_index]
tib <- tibble(variable, wmean, wsd, null_wmean, null_wsd, pvalue)
tib <- tib %>%
gather(key = 'placeholder', value = 'value', wmean, wsd, null_wmean, null_wsd) %>%
mutate(type = ifelse(str_detect(placeholder, 'null'), 'null', 'target'),
placeholder = (str_remove(placeholder, 'null_'))) %>%
mutate(type = factor(type, labels = c('Null', 'Target'))) %>%
spread(placeholder, value)
tib
}
modality_type_vec <- c("Nback","SST","MID")
str_type_vec <- c("X2backvs0back_ROI_","anystopvscorrectgo_ROI_","antiRewVsNeu_ROI_")
enet_interval_plot <-
function(data_input,
tuning_input,
modality_input,
str_input,
resp_input,
plotting_names=resp_var_plotting){
fit_enet_all_IQR <- data_input[[modality_input]]
names(tuning_input)<- modality_vec
glmn_tuning_all_IQR <- tuning_input[[modality_input]]
coefs_enet_all <- resp_input %>%
map(.,
~extract_tibble(
fit_enet_all_IQR[[.]],
alpha_index = paste0("a",glmn_tuning_all_IQR[[.]][["param_tune"]][["mixture"]])))
coefs_enet_all <- coefs_enet_all %>%
map(.,~filter(.,pvalue < 0.05) %>%
mutate(.,type = ifelse(type == 'Null',
'Null permuted models',
'Target models'),
roi = str_remove(variable, str_input))%>%
left_join( .,new_shorter_names,by="roi"))
roi_num_enet <- coefs_enet_all %>% map(.,~dim(.)[1])
max_roi_enet <- max(as.numeric(roi_num_enet))
coefs_enet_test <- coefs_enet_all[[resp_input[1]]] %>%
group_by(type)
coefs_enet_test <- coefs_enet_test%>% nest(-type)
coefs_enet_test[[2]][[1]] <-
coefs_enet_test[[2]][[1]] %>%
mutate(direction1 =
ifelse(coefs_enet_test[[2]][[1]]$wmean >=
median(coefs_enet_test[[2]][[1]]$wmean)|roi_num_enet[[resp_input[1]]] <=
floor(max_roi_enet/2),"big","small"))
coefs_enet_test[[2]][[2]] <- coefs_enet_test[[2]][[2]] %>%
mutate(direction1=coefs_enet_test[[2]][[1]]$direction1)
coefs_enet_test <- coefs_enet_test %>% unnest()
coefs_enet_all <- coefs_enet_all%>%map(.,~group_by(.,type))
coefs_enet_all <- coefs_enet_all%>%map(.,~nest(.,-type))
for(i in 1:length(resp_input)){
coefs_enet_all[[resp_input[i]]][["data"]][[2]]<-
coefs_enet_all[[resp_input[i]]][["data"]][[2]] %>%
mutate(direction = ifelse(coefs_enet_all[[resp_input[i]]][["data"]][[2]]$wmean >= median(coefs_enet_all[[resp_input[i]]][["data"]][[2]]$wmean)|roi_num_enet[[resp_input[i]]] <= floor(max_roi_enet/2),"big","small"))
coefs_enet_all[[resp_input[i]]][["data"]][[1]] <-
coefs_enet_all[[resp_input[i]]][["data"]][[1]] %>%
mutate(direction=coefs_enet_all[[resp_input[i]]][["data"]][[2]]$direction)
}
coefs_enet_all <- coefs_enet_all %>%map(.,~unnest(.))
resp_input %>% map(.,~ggplot(coefs_enet_all[[.]],
aes(x = fct_reorder(roiShort, wmean),
y = wmean,
ymax = wmean + 2 * wsd,
ymin = wmean - 2 * wsd,
col = type)) +
geom_pointrange(fatten = 0.5, key_glyph = 'point') +
scale_y_continuous(labels = numform::ff_num(zero = 0, digits = 2)) +
scale_color_grey(start = 0.7, end = 0.5) +
coord_flip() +
guides(colour = guide_legend(override.aes = list(size = 2.5)))+
labs(x = 'Explanatory Variables (Brain Regions)',
y = 'Averaged Coefficient Across Models (±2 SD)',
col = 'Model type',
title = paste0(modality_input, " task-fMRI Predicting\n",
plotting_names$longer_name[[which(plotting_names$response==.)]])) +
facet_wrap(~ direction, scales = 'free_y') +
scale_color_manual(values = c("#56B4E9", "black"),labels = c("Permuted Null", "Target")) +
theme_bw() +
#theme(axis.text.y = element_text(angle = 15)) +
theme(legend.title=element_blank()) +
theme(legend.position = "top") +
theme(
axis.title.x = element_text(size = 15),
axis.text.x = element_text(size = 12),
axis.title.y = element_text(size = 15),
axis.text.y = element_text(size = 12),
legend.text = element_text(size = 15),
plot.title = element_text(size=15)) +
theme(
strip.background = element_blank(),
strip.text.x = element_blank()
)
+ theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
)
}
map2(.x=modality_type_vec,
.y=str_type_vec,
~enet_interval_plot(data_input=enet_tuning_fit,
tuning_input=enet_results_all,
resp_input = resp_names,
modality_input = .x,
str_input = .y,
plotting_names=resp_var_plotting))
## [[1]]
## [[1]]$TFMRI_NB_ALL_BEH_C2B_RATE
##
## [[1]]$NIHTBX_PICVOCAB_UNCORRECTED
##
## [[1]]$NIHTBX_FLANKER_UNCORRECTED
##
## [[1]]$NIHTBX_PATTERN_UNCORRECTED
##
## [[1]]$NIHTBX_PICTURE_UNCORRECTED
##
## [[1]]$NIHTBX_READING_UNCORRECTED
##
## [[1]]$LMT_SCR_PERC_CORRECT
##
## [[1]]$PEA_RAVLT_LD_TRIAL_VII_TC
##
## [[1]]$TFMRI_SST_ALL_BEH_TOTAL_ISSRT
##
##
## [[2]]
## [[2]]$TFMRI_NB_ALL_BEH_C2B_RATE
##
## [[2]]$NIHTBX_PICVOCAB_UNCORRECTED
##
## [[2]]$NIHTBX_FLANKER_UNCORRECTED
##
## [[2]]$NIHTBX_PATTERN_UNCORRECTED
##
## [[2]]$NIHTBX_PICTURE_UNCORRECTED
##
## [[2]]$NIHTBX_READING_UNCORRECTED
##
## [[2]]$LMT_SCR_PERC_CORRECT
##
## [[2]]$PEA_RAVLT_LD_TRIAL_VII_TC
##
## [[2]]$TFMRI_SST_ALL_BEH_TOTAL_ISSRT
##
##
## [[3]]
## [[3]]$TFMRI_NB_ALL_BEH_C2B_RATE
##
## [[3]]$NIHTBX_PICVOCAB_UNCORRECTED
##
## [[3]]$NIHTBX_FLANKER_UNCORRECTED
##
## [[3]]$NIHTBX_PATTERN_UNCORRECTED
##
## [[3]]$NIHTBX_PICTURE_UNCORRECTED
##
## [[3]]$NIHTBX_READING_UNCORRECTED
##
## [[3]]$LMT_SCR_PERC_CORRECT
##
## [[3]]$PEA_RAVLT_LD_TRIAL_VII_TC
##
## [[3]]$TFMRI_SST_ALL_BEH_TOTAL_ISSRT
enet_interval_plot_smri <- function(data_input,tuning_input,resp_input,plotting_names=resp_var_plotting){
fit_enet_all_IQR <- data_input[["smri"]]
names(tuning_input)<- modality_vec
glmn_tuning_all_IQR <- tuning_input[["smri"]]
coefs_enet_all <- resp_input %>% map(.,~extract_tibble(fit_enet_all_IQR[[.]], alpha_index = paste0("a",glmn_tuning_all_IQR[[.]][["param_tune"]][["mixture"]])))
roi_vec <- coefs_enet_all[[1]][["variable"]]
roi_vec <- gsub("ASEG_","",roi_vec)
roi_vec <- gsub("Dest_Thick_","",roi_vec)
roi_frame <- tibble(variable=coefs_enet_all[[1]][["variable"]],roi=roi_vec)
coefs_enet_all <- coefs_enet_all %>% map(.,~left_join(.,roi_frame,by="variable")%>%filter(.,pvalue < 0.05) %>%
mutate(.,type = ifelse(type == 'Null', 'Null permuted models', 'Target models'))%>%
left_join( .,new_shorter_names,by="roi"))
roi_num_enet <- coefs_enet_all %>% map(.,~dim(.)[1])
max_roi_enet <- max(as.numeric(roi_num_enet))
coefs_enet_test <- coefs_enet_all[[resp_input[1]]] %>%
group_by(type)
coefs_enet_test <- coefs_enet_test%>% nest(-type)
coefs_enet_test[[2]][[1]] <- coefs_enet_test[[2]][[1]] %>%
mutate(direction1 = ifelse(coefs_enet_test[[2]][[1]]$wmean >= median(coefs_enet_test[[2]][[1]]$wmean)|roi_num_enet[[resp_input[1]]] <= floor(max_roi_enet/2),"big","small"))
coefs_enet_test[[2]][[2]] <- coefs_enet_test[[2]][[2]] %>% mutate(direction1=coefs_enet_test[[2]][[1]]$direction1)
coefs_enet_test <- coefs_enet_test %>% unnest()
coefs_enet_all <- coefs_enet_all%>%map(.,~group_by(.,type))
coefs_enet_all <- coefs_enet_all%>%map(.,~nest(.,-type))
for(i in 1:length(resp_input)){
coefs_enet_all[[resp_input[i]]][["data"]][[2]]<-coefs_enet_all[[resp_input[i]]][["data"]][[2]] %>% mutate(direction = ifelse(coefs_enet_all[[resp_input[i]]][["data"]][[2]]$wmean >= median(coefs_enet_all[[resp_input[i]]][["data"]][[2]]$wmean)|roi_num_enet[[resp_input[i]]] <= floor(max_roi_enet/2),"big","small"))
coefs_enet_all[[resp_input[i]]][["data"]][[1]] <- coefs_enet_all[[resp_input[i]]][["data"]][[1]] %>% mutate(direction=coefs_enet_all[[resp_input[i]]][["data"]][[2]]$direction)
}
coefs_enet_all <- coefs_enet_all %>%map(.,~unnest(.))
resp_input %>% map(.,~ggplot(coefs_enet_all[[.]], aes(x = fct_reorder(roiShort, wmean),
y = wmean,
ymax = wmean + 2 * wsd, ymin = wmean - 2 * wsd,
col = type)) +
geom_pointrange(fatten = 0.5, key_glyph = 'point') +
scale_y_continuous(labels = numform::ff_num(zero = 0, digits = 2)) +
scale_color_grey(start = 0.7, end = 0.5) +
coord_flip() +
guides(colour = guide_legend(override.aes = list(size = 2.5)))+
labs(x = 'Explanatory Variables (Brain Regions)',
y = 'Averaged Coefficient Across Models (±2 SD)',
col = 'Model type',
title = paste0("sMRI Predicting\n",plotting_names$longer_name[[which(plotting_names$response==.)]])) +
facet_wrap(~ direction, scales = 'free_y') +
scale_color_manual(values = c("#56B4E9", "black"),labels = c("Permuted Null", "Target")) +
theme_bw() +
#theme(axis.text.y = element_text(angle = 15)) +
theme(legend.title=element_blank()) +
theme(legend.position = "top") +
theme(
axis.title.x = element_text(size = 15),
axis.text.x = element_text(size = 12),
axis.title.y = element_text(size = 15),
axis.text.y = element_text(size = 12),
legend.text = element_text(size = 15),
plot.title = element_text(size=15)) +
theme(
strip.background = element_blank(),
strip.text.x = element_blank()
)
+ theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
)
}
enet_interval_plot_smri(data_input=enet_tuning_fit,tuning_input=enet_results_all,resp_input = resp_names,plotting_names=resp_var_plotting)
## $TFMRI_NB_ALL_BEH_C2B_RATE
##
## $NIHTBX_PICVOCAB_UNCORRECTED
##
## $NIHTBX_FLANKER_UNCORRECTED
##
## $NIHTBX_PATTERN_UNCORRECTED
##
## $NIHTBX_PICTURE_UNCORRECTED
##
## $NIHTBX_READING_UNCORRECTED
##
## $LMT_SCR_PERC_CORRECT
##
## $PEA_RAVLT_LD_TRIAL_VII_TC
##
## $TFMRI_SST_ALL_BEH_TOTAL_ISSRT
roi_names <- colnames(select(rsmri_par_data, starts_with("parAuditory")))%>% str_remove_all(.,"parAuditory")
roi_names_all <- c("Auditory",roi_names)
par_roi_names_all <- paste0("par", roi_names_all)
matrix_prepare <- function(data_list){
heat_matrix <- matrix(rep(0,13*13),nrow = 13)
for(i in 1:dim(data_list)[1]){
if(str_detect(data_list$roi[i],paste0("^","par"))==TRUE){
coor_matrix_1 <- which(as.vector(par_roi_names_all %>% map(.,~str_detect(data_list$roi[i],paste0("\\A",.)))%>%do.call(rbind,.)) ==TRUE)
coor_matrix_2 <- which(as.vector(roi_names_all %>% map(.,~str_detect(data_list$roi[i],paste0(.,"$")))%>%do.call(rbind,.)) ==TRUE)
heat_matrix[coor_matrix_1,coor_matrix_2]<- heat_matrix[coor_matrix_2,coor_matrix_1]<- data_list$estimate[i]
}
if(str_detect(data_list$roi[i],paste0("^","Within"))==TRUE){
coor_matrix <- which(as.vector(roi_names_all %>% map(.,~str_detect(data_list$roi[i],paste0("Within",.)))%>%do.call(rbind,.)) ==TRUE)
heat_matrix[coor_matrix,coor_matrix]<- data_list$estimate[i]
}
}
heat_matrix[lower.tri(heat_matrix)]<- NA
return(heat_matrix)
}
enet_interval_plot_rsmri <- function(data_input,tuning_input,resp_input,plotting_names=resp_var_plotting){
fit_ridge_all_RS_par <- data_input[["rsmri"]]
names(tuning_input)<- modality_vec
glmn_tuning_all_RS_par <- tuning_input[["rsmri"]]
coefs_ridge_all <- resp_input %>% map(.,~extract_tibble(fit_ridge_all_RS_par[[.]], alpha_index = paste0("a",glmn_tuning_all_RS_par[[.]][["param_tune"]][["mixture"]])))
coefs_ridge_all <- coefs_ridge_all %>% map(.,~filter(.,pvalue < 0.05) %>%
mutate(.,type = ifelse(type == 'Null', 'Null permuted models', 'Target models'),roi=variable,roi_short= str_remove_all(variable,"^par")))
roi_num_ridge <- coefs_ridge_all %>% map(.,~dim(.)[1])
max_roi_ridge <- max(as.numeric(roi_num_ridge))
coefs_ridge_test <- coefs_ridge_all[[resp_input[1]]] %>% group_by(type)
coefs_ridge_test <- coefs_ridge_test%>% nest(-type)
coefs_ridge_test[[2]][[1]] <- coefs_ridge_test[[2]][[1]] %>% mutate(direction1 = ifelse(coefs_ridge_test[[2]][[1]]$wmean >= median(coefs_ridge_test[[2]][[1]]$wmean)|roi_num_ridge[[resp_input[1]]] <= floor(max_roi_ridge/2),"big","small"))
coefs_ridge_test[[2]][[2]] <- coefs_ridge_test[[2]][[2]] %>% mutate(direction1=coefs_ridge_test[[2]][[1]]$direction1)
coefs_ridge_test <- coefs_ridge_test %>% unnest()
coefs_ridge_all <- coefs_ridge_all%>%map(.,~group_by(.,type))
coefs_ridge_all <- coefs_ridge_all%>%map(.,~nest(.,-type))
for(i in 1:length(resp_input)){
coefs_ridge_all[[resp_input[i]]][["data"]][[2]]<-coefs_ridge_all[[resp_input[i]]][["data"]][[2]] %>% mutate(direction = ifelse(coefs_ridge_all[[resp_input[i]]][["data"]][[2]]$wmean >= median(coefs_ridge_all[[resp_input[i]]][["data"]][[2]]$wmean)|roi_num_ridge[[resp_input[i]]] <= floor(max_roi_ridge/2),"big","small"))
coefs_ridge_all[[resp_input[i]]][["data"]][[1]] <- coefs_ridge_all[[resp_input[i]]][["data"]][[1]] %>% mutate(direction=coefs_ridge_all[[resp_input[i]]][["data"]][[2]]$direction)
}
coefs_ridge_all <- coefs_ridge_all %>%map(.,~unnest(.))
matrix_ridge<- coefs_ridge_all %>% map(.,~filter(., type=="Target models")%>% mutate(.,estimate=wmean)%>%matrix_prepare(.))
for(i in 1:length(resp_input)){
colnames(matrix_ridge[[resp_input[i]]]) <- roi_names_all
rownames(matrix_ridge[[resp_input[i]]])<- roi_names_all
}
matrix_ridge_test <- matrix_ridge%>% map(.,~as.data.frame(.)%>%mutate(roi = roi_names_all)%>%reshape2::melt(.,id.vars="roi"))
method_vec <- c("Enet")
matrix_list <- list(Enet = matrix_ridge_test)
matrix_list_name <- method_vec %>% map(.,function(method_used=.){
matrix_list_name <- resp_input%>%map(.,~mutate(matrix_list[[method_used]][[.]], method=method_used))
# matrix_list_name <- resp_input %>% map(.,~range(matrix_list_name[[method_used]][[.]]$value,na.rm=TRUE)fileDate = '31Dec2020')
return(matrix_list_name)
})
names(matrix_list_name)=method_vec
legend_all <- method_vec %>% map(.,function(method_used=.){
legend_all <- resp_input %>% map(.,~range(matrix_list_name[[method_used]][[.]]$value,na.rm=TRUE))
return(legend_all)
})
names(legend_all)<- method_vec
legend_all_test <- t(as.matrix (as.data.frame(legend_all)))
legend_test <- tibble("low"=legend_all_test[,1],"upp"=legend_all_test[,2],"resp"=resp_input, "method"=c(rep("Elastic Net",length(resp_input))))
legend_resp <- vector("list", length = length(resp_input))
names(legend_resp)<- resp_input
for(i in 1:length(resp_input)){
legend_resp[[resp_input[i]]]<- tibble("upp"= max(legend_test$upp[which(legend_test$resp==resp_input[i])]),
"low"=min(legend_test$low[which(legend_test$resp==resp_input[i])]))
}
resp_input%>% map(.,~ggplot(data =matrix_ridge_test[[.]], aes(x=variable, y=roi, fill=value)) +
geom_tile(color = "white")+
labs(x=NULL,y=NULL,title=paste0("rs-fMRI Predicting\n", plotting_names$longer_name[[which(plotting_names$response==.)]]))+
scale_fill_gradient2(low = muted("blue") , high =muted("red"), mid = "grey80",
midpoint = 0,na.value = "grey50", limit = c(legend_resp[[.]]$low, legend_resp[[.]]$upp), space = "Lab",
name="Parameter\nEstimation") +
theme_minimal()+
theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1),
axis.text.y = element_text(angle = 45, vjust = 1, size = 12, hjust = 1),
#axis.text.x = element_blank(),
#axis.text.y = element_blank(),
plot.title = element_text(size=17))+
coord_fixed())
}
enet_interval_plot_rsmri(data_input=enet_tuning_fit,tuning_input=enet_results_all,resp_input = resp_names,plotting_names=resp_var_plotting)
## $TFMRI_NB_ALL_BEH_C2B_RATE
##
## $NIHTBX_PICVOCAB_UNCORRECTED
##
## $NIHTBX_FLANKER_UNCORRECTED
##
## $NIHTBX_PATTERN_UNCORRECTED
##
## $NIHTBX_PICTURE_UNCORRECTED
##
## $NIHTBX_READING_UNCORRECTED
##
## $LMT_SCR_PERC_CORRECT
##
## $PEA_RAVLT_LD_TRIAL_VII_TC
##
## $TFMRI_SST_ALL_BEH_TOTAL_ISSRT
enet_interval_plot_DTI <- function(data_input, tuning_input,resp_input,plotting_names=resp_var_plotting){
fit_enet_all_IQR <- data_input[["DTI"]]
names(tuning_input)<- modality_vec
glmn_tuning_all_IQR <- tuning_input[["DTI"]]
coefs_enet_all <- resp_input %>% map(.,~extract_tibble(fit_enet_all_IQR[[.]], alpha_index = paste0("a",glmn_tuning_all_IQR[[.]][["param_tune"]][["mixture"]])))
coefs_enet_all <- coefs_enet_all %>% map(.,~filter(.,pvalue < 0.05) %>%
mutate(.,type = ifelse(type == 'Null', 'Null permuted models', 'Target models'),
roi = str_remove(variable, "FA_")))
roi_num_enet <- coefs_enet_all %>% map(.,~dim(.)[1])
max_roi_enet <- max(as.numeric(roi_num_enet))
coefs_enet_test <- coefs_enet_all[[resp_input[1]]] %>% group_by(type)
coefs_enet_test <- coefs_enet_test%>% nest(-type)
coefs_enet_test[[2]][[1]] <- coefs_enet_test[[2]][[1]] %>% mutate(direction1 = ifelse(coefs_enet_test[[2]][[1]]$wmean >= median(coefs_enet_test[[2]][[1]]$wmean)|roi_num_enet[[resp_input[1]]] <= floor(max_roi_enet/2),"big","small"))
coefs_enet_test[[2]][[2]] <- coefs_enet_test[[2]][[2]] %>% mutate(direction1=coefs_enet_test[[2]][[1]]$direction1)
coefs_enet_test <- coefs_enet_test %>% unnest()
coefs_enet_all <- coefs_enet_all%>%map(.,~group_by(.,type))
coefs_enet_all <- coefs_enet_all%>%map(.,~nest(.,-type))
for(i in 1:length(resp_input)){
coefs_enet_all[[resp_input[i]]][["data"]][[2]]<-coefs_enet_all[[resp_input[i]]][["data"]][[2]] %>% mutate(direction = ifelse(coefs_enet_all[[resp_input[i]]][["data"]][[2]]$wmean >= median(coefs_enet_all[[resp_input[i]]][["data"]][[2]]$wmean)|roi_num_enet[[resp_input[i]]] <= floor(max_roi_enet/2),"big","small"))
coefs_enet_all[[resp_input[i]]][["data"]][[1]] <- coefs_enet_all[[resp_input[i]]][["data"]][[1]] %>% mutate(direction=coefs_enet_all[[resp_input[i]]][["data"]][[2]]$direction)
}
coefs_enet_all <- coefs_enet_all %>%map(.,~unnest(.))
resp_input %>% map(.,~ggplot(coefs_enet_all[[.]], aes(x = fct_reorder(roi, wmean),
y = wmean,
ymax = wmean + 2 * wsd, ymin = wmean - 2 * wsd,
col = type)) +
geom_pointrange(fatten = 0.5, key_glyph = 'point') +
scale_y_continuous(labels = numform::ff_num(zero = 0, digits = 2)) +
scale_color_grey(start = 0.7, end = 0.5) +
coord_flip() +
guides(colour = guide_legend(override.aes = list(size = 2.5)))+
labs(x = 'Explanatory Variables (Brain Regions)', y = 'Averaged Coefficient Across Models (±2 SD)', col = 'Model type',
title = paste0("DTI Predicting\n", plotting_names$longer_name[[which(plotting_names$response==.)]])) +
#facet_wrap(~ direction, scales = 'free_y') +
scale_color_manual(values = c("#56B4E9", "black"),labels = c("Permuted Null", "Target")) +
theme_bw() +
#theme(axis.text.y = element_text(angle = 15)) +
theme(legend.title=element_blank()) +
theme(legend.position = "top") +
theme(
axis.title.x = element_text(size = 15),
axis.text.x = element_text(size = 12),
axis.title.y = element_text(size = 15),
axis.text.y = element_text(size = 12),
legend.text = element_text(size = 15),
plot.title = element_text(size=15)) +
theme(
strip.background = element_blank(),
strip.text.x = element_blank()
)
+ theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
)
}
enet_interval_plot_DTI(data_input=enet_tuning_fit,
tuning_input=enet_results_all,
resp_input = resp_names,
plotting_names=resp_var_plotting)
## $TFMRI_NB_ALL_BEH_C2B_RATE
##
## $NIHTBX_PICVOCAB_UNCORRECTED
##
## $NIHTBX_FLANKER_UNCORRECTED
##
## $NIHTBX_PATTERN_UNCORRECTED
##
## $NIHTBX_PICTURE_UNCORRECTED
##
## $NIHTBX_READING_UNCORRECTED
##
## $LMT_SCR_PERC_CORRECT
##
## $PEA_RAVLT_LD_TRIAL_VII_TC
##
## $TFMRI_SST_ALL_BEH_TOTAL_ISSRT
Her we replace NA from each modality with unlikely small and large values, resulting in 2 predictors for each modality
## the following function just process the data to put into random forest model
rnd_for_data_prep = function(pred_select = "predicted",
data_list,
resp_chose,
resp_data=assessment(baseline_two_fold$splits[[1]]))
{
enet_pred <- plyr::join_all(list(
data_list[["MID_results"]][[resp_chose]][[pred_select]]%>%
dplyr::rename("MID_pred"="pred"),
data_list[["smri_results"]][[resp_chose]][[pred_select]]%>%
dplyr::rename("smri_pred"="pred"),
data_list[["SST_results"]][[resp_chose]][[pred_select]]%>%
dplyr::rename("SST_pred"="pred"),
data_list[["rsmri_results"]][[resp_chose]][[pred_select]]%>%
dplyr::rename("rsmri_pred"="pred"),
data_list[["Nback_results"]][[resp_chose]][[pred_select]]%>%
dplyr::rename("Nback_pred"="pred"),
data_list[["DTI_results"]][[resp_chose]][[pred_select]]%>%
dplyr::rename("DTI_pred"="pred")),
by='SUBJECTKEY', type='full') %>%
distinct(SUBJECTKEY, .keep_all = TRUE)
enet_pred_p <- enet_pred%>% mutate_all(~replace(., is.na(.), 1000))
enet_pred_n <- enet_pred%>% mutate_all(~replace(., is.na(.), -1000))
enet_pred_noNA <- left_join(enet_pred_p,enet_pred_n,by="SUBJECTKEY")
resp_data <- select(resp_data,"SUBJECTKEY",all_of(resp_chose)) %>%
distinct(SUBJECTKEY, .keep_all = TRUE) %>%
drop_na() %>%
mutate_if(is.double, ~ (.x - mean(.x)) / sd(.x))
enet_pred_noNA <- left_join(enet_pred_noNA,resp_data,by="SUBJECTKEY")
enet_pred_noNA_noID <- select(enet_pred_noNA,-SUBJECTKEY)
return(list("ID"=enet_pred_noNA,"NoID"=enet_pred_noNA_noID))
}
enet_rf_valid_data <- resp_names %>%
map(.,~rnd_for_data_prep(data_list = enet_results_all,
resp_chose = all_of(.)))
enet_rf_test_baseline <- resp_names %>%
map(.,~rnd_for_data_prep(pred_select="predicted_baseline",
data_list = enet_results_all,
resp_chose = all_of(.),
resp_data = baseline_test))
enet_rf_test_followup <- resp_names %>%
map(.,~rnd_for_data_prep(pred_select="predicted_followup",
data_list = enet_results_all,
resp_chose = all_of(.),
resp_data = followup_analysis))
stack_datasize_extract <- function(resp_input, data_input){
one_allset <- data.frame(datasplit_number =dim(data_input[[resp_input]][["ID"]])[1],
response=resp_input)
return(one_allset)}
task_stack_valid <- resp_names %>%
map(.,~stack_datasize_extract(resp_input = .,data_input = enet_rf_valid_data))%>%
do.call(rbind,.)%>%
mutate(number_valid = datasplit_number)%>%
subset(.,select=-c(datasplit_number))
task_stack_baseline <- resp_names %>%
map(.,~stack_datasize_extract(resp_input = .,data_input = enet_rf_test_baseline))%>%
do.call(rbind,.)%>%
mutate(baseline_test = datasplit_number)%>%
subset(.,select=-c(datasplit_number))
task_stack_followup <- resp_names %>%
map(.,~stack_datasize_extract(resp_input = .,data_input = enet_rf_test_followup))%>%
do.call(rbind,.)%>%
mutate(followup_test = datasplit_number)%>%
subset(.,select=-c(datasplit_number))
plyr::join_all(list(task_stack_valid,task_stack_baseline,task_stack_followup,resp_var_plotting),
type="left", by = "response")%>%
select("short_name","number_valid",ends_with("test"))%>%
kableExtra::kbl(caption = paste0("Sample size of all the data sets in stacked models ")) %>%
kableExtra::kable_classic(full_width = F, html_font = "Cambria")
short_name | number_valid | baseline_test | followup_test |
---|---|---|---|
2-back Work Mem | 2480 | 4597 | 4902 |
Pic Vocab | 2812 | 5240 | 5282 |
Flanker | 2810 | 5235 | 5372 |
Pattern Speed | 2804 | 5229 | 5351 |
Seq Memory | 2809 | 5235 | 5368 |
Reading Recog | 2809 | 5220 | 5269 |
Little Man | 2808 | 5093 | 5458 |
Audi Verbal | 2793 | 5182 | 5350 |
integration SSRT | 1936 | 3725 | 4053 |
here we specified the 2nd layer model
random_forest_tuning <- function(resp,valid_list,baseline_test, followup_test){
rf_train_data <- valid_list[[resp]]
rf_test_baseline <- baseline_test[[resp]]
rf_test_followup <- followup_test[[resp]]
### to avoid repetition in coding, data frame lists are created with ID and without ID.
resp_var <- resp
##data recipe, assign "SUBJECTKEY" to ID (a class of variables neither belong to predictors nor response variable)
## do not scale the input data as 1000 and -1000 are put there on purpose
forest_rec <- recipe(as.formula(paste0(resp_var, "~ .")),
data = rf_train_data[["NoID"]]) %>%
step_dummy(all_nominal()) %>%
prep(training = rf_train_data[["NoID"]], retain = TRUE)
## mtry is the number of predictors to sample at each split
## min_n (the number of observations needed to keep splitting nodes)
tune_spec <-rand_forest(mtry = tune(),
trees = 1000,
min_n = tune()) %>%
set_mode("regression") %>%
set_engine("ranger")
tune_wf <- workflow() %>%
add_recipe(forest_rec) %>%
add_model(tune_spec)
set.seed(123456)
## nested data validation
all_rs <- rsample::vfold_cv(data=rf_train_data[["ID"]],v = 10,repeats = 1)
## automate generate grid for hyperparameters
#doParallel::registerDoParallel()
rf_grid <- grid_regular(
mtry(range = c(1, 12)), #12 is the highest number of predictors
min_n(range = c(1, 1000)),
levels = c(mtry = 12, min_n = 101) #mtry = 10 so that it increases one step at a time
)
tune_res <- tune_grid(
tune_wf,
resamples = all_rs,
metrics = metric_set(rmse),
grid = rf_grid
)
best_tune <- select_best(tune_res, metric = "rmse")
final_rf <- finalize_model(
tune_spec,
best_tune
)
forest_prep <- prep(forest_rec)
## this one fit the random forest model with the best tuned parameters
rf_fit <- final_rf %>%
set_engine("ranger",importance="permutation") %>%
parsnip::fit(as.formula(paste0(resp_var, "~ .")),
data = juice(forest_prep)
)
pred_baseline <- predict(rf_fit,new_data = rf_test_baseline[["NoID"]])
pred_followup <- predict(rf_fit,new_data = rf_test_followup[["NoID"]])
performance_baseline <- cor(pred_baseline$.pred,rf_test_baseline[["ID"]][[resp]])
performance_followup <- cor(pred_followup$.pred,rf_test_followup[["ID"]][[resp]])
best_tune <- best_tune %>% mutate(performance_baseline=performance_baseline,
performance_followup=performance_followup)
result_list <- list("param_tune"=best_tune,
"predicted_baseline"=tibble(SUBJECTKEY=rf_test_baseline[["ID"]]$SUBJECTKEY,
"pred_baseline"=pred_baseline$.pred),
"predicted_followup"=tibble(SUBJECTKEY=rf_test_followup[["ID"]]$SUBJECTKEY,
"pred_followup"=pred_followup$.pred),
"model_fit"=rf_fit)
return(result_list)
}
Here we actually put all of the fitted results into the random forest. It is time-consuming and computing-expensive to run, so not run by default.
##keep from using all the cores
##keep your computer safe
no_cores <- availableCores() - 4
plan(multisession,workers= no_cores)
rf_fit_all <- resp_names %>%
future_map(.,~random_forest_tuning(resp=.,
valid_list = enet_rf_valid_data,
baseline_test = enet_rf_test_baseline,
followup_test = enet_rf_test_followup),
.options = furrr_options(seed=TRUE),.progress = TRUE)
random_fores_output_print <-
function(rf_output,enet_output,response_vec,names_plotting){
summary_rf <- vector("list", length = length(response_vec))
names(summary_rf) <- response_vec
for(i in 1:length(response_vec)){
summary_rf[[response_vec[i]]]<- as_tibble(rf_output[[response_vec[i]]][["param_tune"]]) %>% slice(1)
}
## mtry is the number of predictors to sample at each split
## min_n (the number of observations needed to keep splitting nodes)
rf_perform_output <- summary_rf %>% bind_rows() %>%
mutate(respones = names_plotting$short_name)%>%
rename(., "#predictors/split"=mtry,
"#observations/split"=min_n,
"model#"=.config,
"Baseline (Pearson r)"=performance_baseline,
"Followup (Pearson r)"=performance_followup)
rf_result_vec <- paste0(mri_vec,"_results")
all_enet_performance <- response_vec %>%
map(.,function(resp){
summary_mri_all <- vector("list", length = length(rf_result_vec))
names(summary_mri_all)<-rf_result_vec
for(i in 1:length(rf_result_vec)){
summary_mri_all[[rf_result_vec[i]]]<-
as_tibble(enet_output[[rf_result_vec[i]]][[resp]][["param_tune"]]) %>% slice(1)
}
summary_mri_all <- summary_mri_all %>%
bind_rows() %>%
mutate(type = mri_vec)
return(summary_mri_all)
})
summary_rf_all <- summary_rf %>% bind_rows() %>% mutate(response=response_vec)
all_enet_performance_baseline <- all_enet_performance %>%
do.call(rbind,.) %>%
mutate(.,response = rep(response_vec, each=length(mri_vec))) %>%
select(type,baseline_performance,response) %>%
pivot_wider(names_from = type, values_from= c(baseline_performance))
all_enet_baseline <-
plyr::join_all(list(all_enet_performance_baseline,names_plotting,summary_rf_all),
type="left", by = "response") %>%
select(all_of(mri_vec),"short_name","performance_baseline")%>%
rename(random_forest_baseline = performance_baseline)
all_enet_performance_followup <- all_enet_performance%>% do.call(rbind,.) %>%
mutate(.,response = rep(response_vec, each=length(mri_vec))) %>%
select(type,followup_performance,response) %>%
pivot_wider(names_from = type, values_from= c(followup_performance))
all_enet_followup <-
plyr::join_all(list(all_enet_performance_followup,names_plotting,summary_rf_all),
type="left", by = "response") %>%
select(all_of(mri_vec),"short_name","performance_followup")%>%
rename(random_forest_followup = performance_followup)
return(list(rf_performance = rf_perform_output,rf_enet_baseline = all_enet_baseline,rf_enet_followup = all_enet_followup))
}
task_rf_results <- random_fores_output_print(rf_output = rf_fit_all,
enet_output = enet_results_all,
response_vec=resp_names,
names_plotting = resp_var_plotting)
task_rf_results[["rf_performance"]]%>%
rename("Response Variable" = respones, "Baseline" = "Baseline (Pearson r)", "Followup" = "Followup (Pearson r)") %>%
select("Response Variable", "Baseline","Followup", "#predictors/split", "#observations/split") %>%
mutate(across(2:3, round, 3)) %>%
kableExtra::kbl(caption = paste0("Performance (Pearson r) and Hyperparameters of Random Forest Stacking for Each Cognitive Task")) %>%
kableExtra::kable_classic(full_width = F, html_font = "Cambria")
Response Variable | Baseline | Followup | #predictors/split | #observations/split |
---|---|---|---|---|
2-back Work Mem | 0.478 | 0.417 | 3 | 140 |
Pic Vocab | 0.364 | 0.379 | 4 | 150 |
Flanker | 0.183 | 0.199 | 4 | 490 |
Pattern Speed | 0.193 | 0.189 | 2 | 420 |
Seq Memory | 0.237 | 0.247 | 4 | 410 |
Reading Recog | 0.311 | 0.311 | 12 | 280 |
Little Man | 0.269 | 0.264 | 3 | 270 |
Audi Verbal | 0.239 | 0.182 | 3 | 590 |
integration SSRT | 0.292 | 0.333 | 6 | 350 |
task_rf_results[["rf_enet_baseline"]] %>%
rename("Response Variable" = short_name,
"NBack-fMRI" = Nback,
"SST-fMRI" = SST,
"MID-fMRI" = MID,
"rs-fMRI" = rsmri,
"sMRI" = smri,
"Stacked" = random_forest_baseline) %>%
select("Response Variable",
"Stacked",
"NBack-fMRI",
"MID-fMRI",
"SST-fMRI",
"rs-fMRI",
"sMRI",
"DTI") %>%
mutate(across(2:8, round, 3)) %>%
kableExtra::kbl(caption = paste0("Performance of Random Forest Stacking VS Elastic Net on Each Modality for Each Cognitive Task\n at the Baseline (Pearson r)")) %>%
kableExtra::kable_classic(full_width = F, html_font = "Cambria")
Response Variable | Stacked | NBack-fMRI | MID-fMRI | SST-fMRI | rs-fMRI | sMRI | DTI |
---|---|---|---|---|---|---|---|
2-back Work Mem | 0.478 | 0.492 | 0.154 | 0.175 | 0.125 | 0.140 | 0.130 |
Pic Vocab | 0.364 | 0.318 | 0.174 | 0.124 | 0.213 | 0.184 | 0.178 |
Flanker | 0.183 | 0.199 | 0.065 | 0.018 | 0.057 | 0.033 | 0.079 |
Pattern Speed | 0.193 | 0.182 | 0.080 | 0.032 | 0.087 | 0.090 | 0.121 |
Seq Memory | 0.237 | 0.210 | 0.102 | NA | 0.095 | 0.112 | 0.120 |
Reading Recog | 0.311 | 0.289 | 0.157 | 0.102 | 0.189 | 0.166 | 0.138 |
Little Man | 0.269 | 0.243 | 0.079 | 0.095 | 0.114 | 0.131 | 0.116 |
Audi Verbal | 0.239 | 0.151 | 0.093 | 0.076 | 0.124 | 0.094 | 0.098 |
integration SSRT | 0.292 | 0.122 | 0.015 | 0.328 | 0.031 | 0.067 | 0.086 |
task_rf_results[["rf_enet_followup"]] %>%
rename("Response Variable" = short_name,
"NBack-fMRI" = Nback,
"SST-fMRI" = SST,
"MID-fMRI" = MID,
"rs-fMRI" = rsmri,
"sMRI" = smri,
"Stacked" = random_forest_followup) %>%
select("Response Variable",
"Stacked",
"NBack-fMRI",
"MID-fMRI",
"SST-fMRI",
"rs-fMRI",
"sMRI",
"DTI") %>%
mutate(across(2:8, round, 3)) %>%
kableExtra::kbl(caption = paste0("Performance of Random Forest Stacking VS Elastic Net on Each Modality for Each Cognitive Task\n at the Follow-Up (Pearson r)")) %>%
kableExtra::kable_classic(full_width = F, html_font = "Cambria")
Response Variable | Stacked | NBack-fMRI | MID-fMRI | SST-fMRI | rs-fMRI | sMRI | DTI |
---|---|---|---|---|---|---|---|
2-back Work Mem | 0.417 | 0.480 | 0.132 | 0.195 | 0.185 | 0.177 | 0.199 |
Pic Vocab | 0.379 | 0.298 | 0.128 | 0.147 | 0.251 | 0.226 | 0.190 |
Flanker | 0.199 | 0.174 | 0.044 | 0.050 | 0.093 | 0.058 | 0.141 |
Pattern Speed | 0.189 | 0.193 | 0.063 | 0.048 | 0.090 | 0.102 | 0.093 |
Seq Memory | 0.247 | 0.228 | 0.098 | NA | 0.127 | 0.128 | 0.169 |
Reading Recog | 0.311 | 0.284 | 0.126 | 0.092 | 0.205 | 0.174 | 0.151 |
Little Man | 0.264 | 0.237 | 0.058 | 0.082 | 0.097 | 0.118 | 0.246 |
Audi Verbal | 0.182 | 0.156 | 0.046 | 0.080 | 0.126 | 0.091 | 0.062 |
integration SSRT | 0.333 | 0.160 | 0.017 | 0.371 | 0.086 | 0.077 | 0.120 |
the results should be treated with caution as the opportunistic stacking includes inherently colineared predictors.
library(vip)
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
resp_names%>% map(.,~rf_fit_all[[.]][["model_fit"]] %>%
vip(geom = "point"))
## $TFMRI_NB_ALL_BEH_C2B_RATE
##
## $NIHTBX_PICVOCAB_UNCORRECTED
##
## $NIHTBX_FLANKER_UNCORRECTED
##
## $NIHTBX_PATTERN_UNCORRECTED
##
## $NIHTBX_PICTURE_UNCORRECTED
##
## $NIHTBX_READING_UNCORRECTED
##
## $LMT_SCR_PERC_CORRECT
##
## $PEA_RAVLT_LD_TRIAL_VII_TC
##
## $TFMRI_SST_ALL_BEH_TOTAL_ISSRT
enet_rf_valid_data_permu <- enet_rf_valid_data %>% map(.,function(data_input=.){
data_output<- data_input[["NoID"]] %>%
dplyr::rename("MID_large"="MID_pred.x",
"SST_large"="SST_pred.x",
"smri_large"="smri_pred.x",
"rsmri_large"="rsmri_pred.x",
"DTI_large"="DTI_pred.x",
"Nback_large"="Nback_pred.x",
"MID_small"="MID_pred.y",
"SST_small"="SST_pred.y",
"smri_small"="smri_pred.y",
"rsmri_small"="rsmri_pred.y",
"DTI_small"="DTI_pred.y",
"Nback_small"="Nback_pred.y")
return(data_output)
})
cf_fit <- resp_names %>% map(.,function(resp_var=.){
cforest(as.formula(paste0(resp_var, "~ .")),
data=enet_rf_valid_data_permu[[resp_var]],
control= cforest_unbiased(mtry=rf_fit_all[[resp_var]][["param_tune"]]$mtry,
ntree=1000,minsplit=rf_fit_all[[resp_var]][["param_tune"]]$min_n))
})
PI_permimp <-cf_fit %>% map(.,~permimp(.,progressBar = FALSE))
resp_names %>% map(.,~plot(PI_permimp[[.]],
type = "box",
horizontal = TRUE,
main=paste0("Conditional Permutation Importance ",
resp_var_plotting$short_name[[which(resp_var_plotting$response==.)]])))
## $TFMRI_NB_ALL_BEH_C2B_RATE
## NULL
##
## $NIHTBX_PICVOCAB_UNCORRECTED
## NULL
##
## $NIHTBX_FLANKER_UNCORRECTED
## NULL
##
## $NIHTBX_PATTERN_UNCORRECTED
## NULL
##
## $NIHTBX_PICTURE_UNCORRECTED
## NULL
##
## $NIHTBX_READING_UNCORRECTED
## NULL
##
## $LMT_SCR_PERC_CORRECT
## NULL
##
## $PEA_RAVLT_LD_TRIAL_VII_TC
## NULL
##
## $TFMRI_SST_ALL_BEH_TOTAL_ISSRT
## NULL
resp_names %>% map(.,~plot(PI_permimp[[.]],
type = "bar",
interval="quantile",
main=paste0("Conditional Permutation Importance ",
resp_var_plotting$short_name[[which(resp_var_plotting$response==.)]])))
## $TFMRI_NB_ALL_BEH_C2B_RATE
## NULL
##
## $NIHTBX_PICVOCAB_UNCORRECTED
## NULL
##
## $NIHTBX_FLANKER_UNCORRECTED
## NULL
##
## $NIHTBX_PATTERN_UNCORRECTED
## NULL
##
## $NIHTBX_PICTURE_UNCORRECTED
## NULL
##
## $NIHTBX_READING_UNCORRECTED
## NULL
##
## $LMT_SCR_PERC_CORRECT
## NULL
##
## $PEA_RAVLT_LD_TRIAL_VII_TC
## NULL
##
## $TFMRI_SST_ALL_BEH_TOTAL_ISSRT
## NULL
random_forest_plotting_followup <- function(rf_fit, followup_data, resp_vec, naming_frame ){
followup_rf_pred <- resp_vec %>% map(.,function(resp_var=.){
resp_pred <- tibble(pred = rf_fit[[resp_var]][["predicted_followup"]][["pred_followup"]],
observed=followup_data[[resp_var]][["ID"]][[resp_var]],
SUBJECTKEY=followup_data[[resp_var]][["ID"]][["SUBJECTKEY"]])
return(resp_pred)
})
task_followup_pred <- resp_vec %>%
map(., ~ggplot(followup_rf_pred[[.]],
aes(x = scale(pred) ,
y = scale(observed))) +
geom_jitter(height = 0.1, width = 0.1, size = 1, col = 'grey60') +
geom_smooth(method = 'lm', se = FALSE, col = 'black') +
labs(x = NULL,
y = NULL,
title = paste (naming_frame$short_name[[which(naming_frame$response==.)]],
'\nr = ',round(rf_fit[[.]][["param_tune"]][["performance_followup"]], 3)))
)
title_task_rf_pred_plot <- ggdraw() +
draw_label(
"Out-of-Sample Random Forest Stacking Predictive Ability at Followup",
fontface = 'bold',
x = 0,
hjust = 0
) +
theme(
# add margin on the left of the drawing canvas,
# so title is aligned with left edge of first plot
plot.margin = margin(0, 0, 0, 7)
)
followup_pred_all_figure<- plot_grid(title_task_rf_pred_plot,plot_grid(plotlist = task_followup_pred),nrow = 2 , rel_heights = c(0.1, 1))
ggpubr::annotate_figure(followup_pred_all_figure,left= ggpubr::text_grob("Observed Cognitive Performance (Z)",size=15,rot=90),
bottom = ggpubr::text_grob("Predicted Cognitive Performance (Z)",size=15))
}
random_forest_plotting_baseline <- function(rf_fit, baseline_data, resp_vec,naming_frame ){
baseline_rf_pred <- resp_vec %>% map(.,function(resp_var=.){
resp_pred <- tibble(pred = rf_fit[[resp_var]][["predicted_baseline"]][["pred_baseline"]],
observed=baseline_data[[resp_var]][["ID"]][[resp_var]],
SUBJECTKEY=baseline_data[[resp_var]][["ID"]][["SUBJECTKEY"]])
return(resp_pred)
})
task_baseline_pred <- resp_vec %>% map(., ~ggplot(baseline_rf_pred[[.]],aes(x = scale(pred) , y = scale(observed))) +
geom_jitter(height = 0.1, width = 0.1, size = 1, col = 'grey60') +
geom_smooth(method = 'lm', se = FALSE, col = 'black') +
labs(x = NULL,
y = NULL,
title = paste (naming_frame$short_name[[which(naming_frame$response==.)]],'\nr = ',round(rf_fit[[.]][["param_tune"]][["performance_baseline"]], 3)))
)
title_task_rf_pred_plot_baseline <- ggdraw() +
draw_label(
"Out-of-Sample Random Forest Stacking Predictive Ability at Baseline",
fontface = 'bold',
x = 0,
hjust = 0
) +
theme(
# add margin on the left of the drawing canvas,
# so title is aligned with left edge of first plot
plot.margin = margin(0, 0, 0, 7)
)
baseline_pred_all_figure<- plot_grid(title_task_rf_pred_plot_baseline,
plot_grid(plotlist = task_baseline_pred),
nrow = 2 ,
rel_heights = c(0.1, 1))
ggpubr::annotate_figure(baseline_pred_all_figure,
left= ggpubr::text_grob("Observed Cognitive Performance (Z)",size=15,rot=90),
bottom = ggpubr::text_grob("Predicted Cognitive Performance (Z)",size=15))
}
random_forest_plotting_followup(rf_fit =rf_fit_all,followup_data=enet_rf_test_followup,naming_frame = resp_var_plotting, resp_vec= resp_names)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
random_forest_plotting_baseline(rf_fit =rf_fit_all,baseline_data=enet_rf_test_baseline,naming_frame = resp_var_plotting , resp_vec= resp_names)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
here cfa is used to compute g factor.
We fit two models (first order and second order models) onto the
training data, and apply the fit to the validation (2nd-layer) and test
sets (baseline and followup). This is to protect us from
dataleakage.
# we only inlcuded task that are avaliable on two time points
TaskDVs1Batch = c("NIHTBX_PICVOCAB_UNCORRECTED",
"NIHTBX_READING_UNCORRECTED",
"NIHTBX_FLANKER_UNCORRECTED",
"NIHTBX_PATTERN_UNCORRECTED",
"NIHTBX_PICTURE_UNCORRECTED",
"PEA_RAVLT_LD_TRIAL_VII_TC"
)
CFA_lav_output_processing <- function(data_input,CFA_model1,CFA_model2,CFA_model3){
first_order_output <- lavaan::lavPredict(CFA_model1, newdata =data_input )
second_order_output <- lavaan::lavPredict(CFA_model2, newdata = data_input)
single_factor_output <- lavaan::lavPredict(CFA_model3, newdata = data_input)
### getting SUBJECTKEY for CFA model prediction
### the number of columns of CFA predictions and task performance data matched
task_data <- select(data_input,all_of(TaskDVs1Batch),"SUBJECTKEY")%>% drop_na()
CFA_lav <- tibble(SUBJECTKEY =task_data$SUBJECTKEY,
Lang_first_order = first_order_output[,1],
CogFlex_first_order = first_order_output[,2],
MemRe_first_order = first_order_output[,3],
g_second_order =second_order_output[,4],
g_single_factor =single_factor_output[,1])
return(CFA_lav)
}
CFA_lav_response <- function(data_input = baseline_two_fold$splits[[1]],
test_baseline =baseline_test,
test_followup=followup_analysis ){
## training the model and for enet parameter tuning
data_train <- analysis(data_input)%>%
select(all_of(subj_info),all_of(TaskDVs1Batch))%>% drop_na()%>% IQR_remove()
## random forest tuning
data_valid <- assessment(data_input)%>%
select(all_of(subj_info),all_of(TaskDVs1Batch))%>% drop_na()%>% IQR_remove()
## test set for baseline observations
data_baseline <- test_baseline%>%
select(all_of(subj_info),all_of(TaskDVs1Batch))%>% drop_na()%>% IQR_remove()
## test set for followup observations
data_followup <- test_followup%>%
select(all_of(subj_info),all_of(TaskDVs1Batch))%>% drop_na()%>% IQR_remove()
### first order CFA model
NeuroCog1stOrder <-'
Language =~ NIHTBX_PICVOCAB_UNCORRECTED + NIHTBX_READING_UNCORRECTED
CognitiveFlexibity =~ NIHTBX_FLANKER_UNCORRECTED + NIHTBX_PATTERN_UNCORRECTED
MemoryRecall =~ NIHTBX_PICTURE_UNCORRECTED + PEA_RAVLT_LD_TRIAL_VII_TC'
NeuroCog1stOrder.Fit <- lavaan::cfa(model = NeuroCog1stOrder, data = data_train,estimator="MLR")
### second order CFA model
NeuroCog2ndOrder <-'
Language =~ NIHTBX_PICVOCAB_UNCORRECTED + NIHTBX_READING_UNCORRECTED
CognitiveFlexibity =~ NIHTBX_FLANKER_UNCORRECTED + NIHTBX_PATTERN_UNCORRECTED
MemoryRecall =~ NIHTBX_PICTURE_UNCORRECTED + PEA_RAVLT_LD_TRIAL_VII_TC
g =~ NA*Language + CognitiveFlexibity + MemoryRecall #estimate the loading of GenAbi -> as opposed to using it as a marker
g ~~ 1*g #need to constrain variance to 1'
NeuroCog2ndOrder.Fit <- lavaan::cfa(model = NeuroCog2ndOrder, data = data_train,estimator="MLR")
### single factor CFA model
NeuroCogSingleFactor <-'
g =~ NIHTBX_PICVOCAB_UNCORRECTED + NIHTBX_READING_UNCORRECTED + NIHTBX_FLANKER_UNCORRECTED + NIHTBX_PATTERN_UNCORRECTED + NIHTBX_PICTURE_UNCORRECTED + PEA_RAVLT_LD_TRIAL_VII_TC'
NeuroCogSingleFactor.Fit <- lavaan::cfa(model = NeuroCogSingleFactor, data = data_train,estimator="MLR")
train_CFA_lav <- CFA_lav_output_processing(data_input = data_train,
CFA_model1 = NeuroCog1stOrder.Fit,
CFA_model2 = NeuroCog2ndOrder.Fit,
CFA_model3 = NeuroCogSingleFactor.Fit)
valid_CFA_lav <- CFA_lav_output_processing(data_input = data_valid,
CFA_model1 = NeuroCog1stOrder.Fit,
CFA_model2 = NeuroCog2ndOrder.Fit,
CFA_model3 = NeuroCogSingleFactor.Fit)
baseline_CFA_lav <- CFA_lav_output_processing(data_input = data_baseline,
CFA_model1 = NeuroCog1stOrder.Fit,
CFA_model2 = NeuroCog2ndOrder.Fit,
CFA_model3 = NeuroCogSingleFactor.Fit)
followup_CFA_lav <- CFA_lav_output_processing(data_input = data_followup,
CFA_model1 = NeuroCog1stOrder.Fit,
CFA_model2 = NeuroCog2ndOrder.Fit,
CFA_model3 = NeuroCogSingleFactor.Fit)
output_list <- list(enet_tuning = train_CFA_lav,
RanFor_tuning = valid_CFA_lav,
testing_baseline = baseline_CFA_lav,
testing_followup=followup_CFA_lav)
return(list(output_list, NeuroCog1stOrder.Fit, NeuroCog2ndOrder.Fit, NeuroCogSingleFactor.Fit))
}
CFA_list <- CFA_lav_response(data_input = baseline_two_fold$splits[[1]],
test_baseline =baseline_test,
test_followup=followup_analysis)
CFA_response <- CFA_list[[1]]
NeuroCog1stOrder.Fit <- CFA_list[[2]]
NeuroCog2ndOrder.Fit <- CFA_list[[3]]
NeuroCogSingleFactor.Fit <- CFA_list[[4]]
the result is 3
data_train_fa <- analysis(baseline_two_fold$splits[[1]])%>%
select(all_of(subj_info),all_of(TaskDVs1Batch))%>% drop_na()%>% IQR_remove()
data_train_fa %>% select(all_of(TaskDVs1Batch)) %>%
psych::fa.parallel(fm='ml', fa='fa')
## Parallel analysis suggests that the number of factors = 3 and the number of components = NA
Task.faObli <-
data_train_fa %>%
select(all_of(TaskDVs1Batch)) %>%
psych::fa(nfactors = 3, #number of factors
rotate = "oblimin", #rotation type
fm = "ml", #math
score = T)
## Loading required namespace: GPArotation
print(Task.faObli$loadings,
#cut =.3,
sort=TRUE)
##
## Loadings:
## ML1 ML3 ML2
## NIHTBX_PICVOCAB_UNCORRECTED 0.702
## NIHTBX_READING_UNCORRECTED 0.730
## NIHTBX_PICTURE_UNCORRECTED 0.688
## PEA_RAVLT_LD_TRIAL_VII_TC 0.129 0.575
## NIHTBX_FLANKER_UNCORRECTED 0.147 0.534
## NIHTBX_PATTERN_UNCORRECTED 0.679
##
## ML1 ML3 ML2
## SS loadings 1.070 0.805 0.750
## Proportion Var 0.178 0.134 0.125
## Cumulative Var 0.178 0.313 0.437
Task.faObli
## Factor Analysis using method = ml
## Call: psych::fa(r = ., nfactors = 3, rotate = "oblimin", scores = T,
## fm = "ml")
## Standardized loadings (pattern matrix) based upon correlation matrix
## ML1 ML3 ML2 h2 u2 com
## NIHTBX_PICVOCAB_UNCORRECTED 0.70 0.01 0.01 0.51 0.49 1.0
## NIHTBX_READING_UNCORRECTED 0.73 0.01 0.01 0.55 0.45 1.0
## NIHTBX_FLANKER_UNCORRECTED 0.15 -0.01 0.53 0.37 0.63 1.2
## NIHTBX_PATTERN_UNCORRECTED -0.05 0.02 0.68 0.44 0.56 1.0
## NIHTBX_PICTURE_UNCORRECTED -0.06 0.69 0.04 0.45 0.55 1.0
## PEA_RAVLT_LD_TRIAL_VII_TC 0.13 0.58 -0.04 0.41 0.59 1.1
##
## ML1 ML3 ML2
## SS loadings 1.12 0.84 0.78
## Proportion Var 0.19 0.14 0.13
## Cumulative Var 0.19 0.33 0.46
## Proportion Explained 0.41 0.31 0.28
## Cumulative Proportion 0.41 0.72 1.00
##
## With factor correlations of
## ML1 ML3 ML2
## ML1 1.00 0.58 0.43
## ML3 0.58 1.00 0.42
## ML2 0.43 0.42 1.00
##
## Mean item complexity = 1
## Test of the hypothesis that 3 factors are sufficient.
##
## The degrees of freedom for the null model are 15 and the objective function was 1.02 with Chi Square of 2945.66
## The degrees of freedom for the model are 0 and the objective function was 0
##
## The root mean square of the residuals (RMSR) is 0
## The df corrected root mean square of the residuals is NA
##
## The harmonic number of observations is 2902 with the empirical chi square 0.01 with prob < NA
## The total number of observations was 2902 with Likelihood Chi Square = 0.01 with prob < NA
##
## Tucker Lewis Index of factoring reliability = -Inf
## Fit based upon off diagonal values = 1
## Measures of factor score adequacy
## ML1 ML3 ML2
## Correlation of (regression) scores with factors 0.85 0.80 0.78
## Multiple R square of scores with factors 0.73 0.65 0.60
## Minimum correlation of possible factor scores 0.45 0.30 0.21
## training the model and for enet parameter tuning
data_train <- analysis(baseline_two_fold$splits[[1]])%>%
select(all_of(subj_info),all_of(TaskDVs1Batch))%>% drop_na()%>% IQR_remove()
## random forest tuning
data_valid <- assessment(baseline_two_fold$splits[[1]])%>%
select(all_of(subj_info),all_of(TaskDVs1Batch))%>% drop_na()%>% IQR_remove()
## test set for baseline observations
data_baseline <- baseline_test%>%
select(all_of(subj_info),all_of(TaskDVs1Batch))%>% drop_na()%>% IQR_remove()
## test set for followup observations
data_followup <- followup_analysis%>%
select(all_of(subj_info),all_of(TaskDVs1Batch))%>% drop_na()%>% IQR_remove()
Task.faObli_data_train_pred <- psych::predict.psych(object = Task.faObli,
data = data_train %>% select(all_of(TaskDVs1Batch)),
old.data = data_train %>% select(all_of(TaskDVs1Batch))) %>%
as_tibble() %>%
rename(Language_EFA = ML1,
CognitiveFlexibity_EFA = ML2,
MemoryRecall_EFA = ML3)
NeuroCogEFA_CFA <-'
g =~ Language_EFA + MemoryRecall_EFA + CognitiveFlexibity_EFA'
NeuroCogEFA_CFA.Fit <- lavaan::cfa(model = NeuroCogEFA_CFA,
data = Task.faObli_data_train_pred,
estimator="MLR")
CFA_response$enet_tuning <-
CFA_response$enet_tuning %>% bind_cols(
lavaan::lavPredict(NeuroCogEFA_CFA.Fit,
newdata =Task.faObli_data_train_pred)%>%
as.double() %>% as_tibble() %>%
rename(g_EFA_CFA = value))
## data valid
Task.faObli_data_valid_pred <- psych::predict.psych(object = Task.faObli,
data = data_valid %>% select(all_of(TaskDVs1Batch)),
old.data = data_train %>% select(all_of(TaskDVs1Batch))) %>%
as_tibble() %>%
rename(Language_EFA = ML1,
CognitiveFlexibity_EFA = ML2,
MemoryRecall_EFA = ML3)
CFA_response$RanFor_tuning <-
CFA_response$RanFor_tuning %>% bind_cols(
lavaan::lavPredict(NeuroCogEFA_CFA.Fit,
newdata =Task.faObli_data_valid_pred)%>%
as.double() %>% as_tibble() %>%
rename(g_EFA_CFA = value))
## data_baseline
Task.faObli_data_baseline_pred <- psych::predict.psych(object = Task.faObli,
data = data_baseline %>% select(all_of(TaskDVs1Batch)),
old.data = data_train %>% select(all_of(TaskDVs1Batch))) %>%
as_tibble() %>%
rename(Language_EFA = ML1,
CognitiveFlexibity_EFA = ML2,
MemoryRecall_EFA = ML3)
CFA_response$testing_baseline <-
CFA_response$testing_baseline %>% bind_cols(
lavaan::lavPredict(NeuroCogEFA_CFA.Fit,
newdata =Task.faObli_data_baseline_pred)%>%
as.double() %>% as_tibble() %>%
rename(g_EFA_CFA = value))
## data_follow up
Task.faObli_data_followup_pred <- psych::predict.psych(object = Task.faObli,
data = data_followup %>% select(all_of(TaskDVs1Batch)),
old.data = data_train %>% select(all_of(TaskDVs1Batch))) %>%
as_tibble() %>%
rename(Language_EFA = ML1,
CognitiveFlexibity_EFA = ML2,
MemoryRecall_EFA = ML3)
CFA_response$testing_followup <-
CFA_response$testing_followup %>% bind_cols(
lavaan::lavPredict(NeuroCogEFA_CFA.Fit,
newdata =Task.faObli_data_followup_pred)%>%
as.double() %>% as_tibble() %>%
rename(g_EFA_CFA = value))
lavaan::summary(NeuroCogEFA_CFA.Fit, standardized = TRUE, rsquare = TRUE, fit.measures = TRUE)
## lavaan 0.6-11 ended normally after 19 iterations
##
## Estimator ML
## Optimization method NLMINB
## Number of model parameters 6
##
## Number of observations 2902
##
## Model Test User Model:
## Standard Robust
## Test Statistic 0.000 0.000
## Degrees of freedom 0 0
##
## Model Test Baseline Model:
##
## Test statistic 3882.254 3837.255
## Degrees of freedom 3 3
## P-value 0.000 0.000
## Scaling correction factor 1.012
##
## User Model versus Baseline Model:
##
## Comparative Fit Index (CFI) 1.000 1.000
## Tucker-Lewis Index (TLI) 1.000 1.000
##
## Robust Comparative Fit Index (CFI) 1.000
## Robust Tucker-Lewis Index (TLI) 1.000
##
## Loglikelihood and Information Criteria:
##
## Loglikelihood user model (H0) -8584.296 -8584.296
## Loglikelihood unrestricted model (H1) -8584.296 -8584.296
##
## Akaike (AIC) 17180.591 17180.591
## Bayesian (BIC) 17216.430 17216.430
## Sample-size adjusted Bayesian (BIC) 17197.366 17197.366
##
## Root Mean Square Error of Approximation:
##
## RMSEA 0.000 0.000
## 90 Percent confidence interval - lower 0.000 0.000
## 90 Percent confidence interval - upper 0.000 0.000
## P-value RMSEA <= 0.05 NA NA
##
## Robust RMSEA 0.000
## 90 Percent confidence interval - lower 0.000
## 90 Percent confidence interval - upper 0.000
##
## Standardized Root Mean Square Residual:
##
## SRMR 0.000 0.000
##
## Parameter Estimates:
##
## Standard errors Sandwich
## Information bread Observed
## Observed information based on Hessian
##
## Latent Variables:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## g =~
## Language_EFA 1.000 0.741 0.870
## MemoryRcll_EFA 0.933 0.021 44.922 0.000 0.692 0.860
## CgntvFlxbt_EFA 0.720 0.019 37.065 0.000 0.534 0.687
##
## Variances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## .Language_EFA 0.177 0.011 16.639 0.000 0.177 0.244
## .MemoryRcll_EFA 0.169 0.009 17.908 0.000 0.169 0.261
## .CgntvFlxbt_EFA 0.318 0.010 32.873 0.000 0.318 0.528
## g 0.550 0.022 25.501 0.000 1.000 1.000
##
## R-Square:
## Estimate
## Language_EFA 0.756
## MemoryRcll_EFA 0.739
## CgntvFlxbt_EFA 0.472
Plabels = c("Language","Memory\nRecall","Mental\nFlexibity",expression(paste(italic("g"))))
semPlot::semPaths(object=NeuroCogEFA_CFA.Fit,
intercepts = F, residuals = F,
whatLabels="no",
what = "std",
layout="tree",
node.width = 1.4,
edge.label.cex = .6,
nodeLabels=Plabels,
edge.color="black",
sizeMan = 10, sizeLat = 10)
#semPlot::semPaths(NeuroCogEFA_CFA.Fit)
colnames(Task.faObli$loadings) <- c("Language","Memory\nRecall","Mental\nFlexibity")
rownames(Task.faObli$loadings) <-
c("Vocab","Reading","Flanker","Pattern","Picture","RAVLT")
psych::fa.diagram(Task.faObli)
Task.faObli
## Factor Analysis using method = ml
## Call: psych::fa(r = ., nfactors = 3, rotate = "oblimin", scores = T,
## fm = "ml")
## Standardized loadings (pattern matrix) based upon correlation matrix
## Language Memory\nRecall Mental\nFlexibity h2 u2 com
## Vocab 0.70 0.01 0.01 0.51 0.49 1.0
## Reading 0.73 0.01 0.01 0.55 0.45 1.0
## Flanker 0.15 -0.01 0.53 0.37 0.63 1.2
## Pattern -0.05 0.02 0.68 0.44 0.56 1.0
## Picture -0.06 0.69 0.04 0.45 0.55 1.0
## RAVLT 0.13 0.58 -0.04 0.41 0.59 1.1
##
## Language Memory\nRecall Mental\nFlexibity
## SS loadings 1.12 0.84 0.78
## Proportion Var 0.19 0.14 0.13
## Cumulative Var 0.19 0.33 0.46
## Proportion Explained 0.41 0.31 0.28
## Cumulative Proportion 0.41 0.72 1.00
##
## With factor correlations of
## Language Memory\nRecall Mental\nFlexibity
## Language 1.00 0.58 0.43
## Memory\nRecall 0.58 1.00 0.42
## Mental\nFlexibity 0.43 0.42 1.00
##
## Mean item complexity = 1
## Test of the hypothesis that 3 factors are sufficient.
##
## The degrees of freedom for the null model are 15 and the objective function was 1.02 with Chi Square of 2945.66
## The degrees of freedom for the model are 0 and the objective function was 0
##
## The root mean square of the residuals (RMSR) is 0
## The df corrected root mean square of the residuals is NA
##
## The harmonic number of observations is 2902 with the empirical chi square 0.01 with prob < NA
## The total number of observations was 2902 with Likelihood Chi Square = 0.01 with prob < NA
##
## Tucker Lewis Index of factoring reliability = -Inf
## Fit based upon off diagonal values = 1
## Measures of factor score adequacy
## Language Memory\nRecall
## Correlation of (regression) scores with factors 0.85 0.80
## Multiple R square of scores with factors 0.73 0.65
## Minimum correlation of possible factor scores 0.45 0.30
## Mental\nFlexibity
## Correlation of (regression) scores with factors 0.78
## Multiple R square of scores with factors 0.60
## Minimum correlation of possible factor scores 0.21
lavaan::summary(NeuroCog1stOrder.Fit, standardized = TRUE, rsquare = TRUE, fit.measures = TRUE)
## lavaan 0.6-11 ended normally after 31 iterations
##
## Estimator ML
## Optimization method NLMINB
## Number of model parameters 15
##
## Number of observations 2902
##
## Model Test User Model:
## Standard Robust
## Test Statistic 20.462 20.029
## Degrees of freedom 6 6
## P-value (Chi-square) 0.002 0.003
## Scaling correction factor 1.022
## Yuan-Bentler correction (Mplus variant)
##
## Model Test Baseline Model:
##
## Test statistic 2949.557 2772.060
## Degrees of freedom 15 15
## P-value 0.000 0.000
## Scaling correction factor 1.064
##
## User Model versus Baseline Model:
##
## Comparative Fit Index (CFI) 0.995 0.995
## Tucker-Lewis Index (TLI) 0.988 0.987
##
## Robust Comparative Fit Index (CFI) 0.995
## Robust Tucker-Lewis Index (TLI) 0.988
##
## Loglikelihood and Information Criteria:
##
## Loglikelihood user model (H0) -23239.010 -23239.010
## Scaling correction factor 1.121
## for the MLR correction
## Loglikelihood unrestricted model (H1) -23228.779 -23228.779
## Scaling correction factor 1.093
## for the MLR correction
##
## Akaike (AIC) 46508.020 46508.020
## Bayesian (BIC) 46597.617 46597.617
## Sample-size adjusted Bayesian (BIC) 46549.957 46549.957
##
## Root Mean Square Error of Approximation:
##
## RMSEA 0.029 0.028
## 90 Percent confidence interval - lower 0.016 0.015
## 90 Percent confidence interval - upper 0.043 0.042
## P-value RMSEA <= 0.05 0.994 0.996
##
## Robust RMSEA 0.029
## 90 Percent confidence interval - lower 0.015
## 90 Percent confidence interval - upper 0.043
##
## Standardized Root Mean Square Residual:
##
## SRMR 0.014 0.014
##
## Parameter Estimates:
##
## Standard errors Sandwich
## Information bread Observed
## Observed information based on Hessian
##
## Latent Variables:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## Language =~
## NIHTBX_PICVOCA 1.000 0.712 0.712
## NIHTBX_READING 1.042 0.048 21.683 0.000 0.742 0.742
## CognitiveFlexibity =~
## NIHTBX_FLANKER 1.000 0.721 0.721
## NIHTBX_PATTERN 0.749 0.058 12.814 0.000 0.541 0.541
## MemoryRecall =~
## NIHTBX_PICTURE 1.000 0.603 0.603
## PEA_RAVLT_LD_T 1.152 0.068 16.872 0.000 0.694 0.695
##
## Covariances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## Language ~~
## CognitivFlxbty 0.266 0.019 14.102 0.000 0.518 0.518
## MemoryRecall 0.272 0.017 15.811 0.000 0.633 0.633
## CognitiveFlexibity ~~
## MemoryRecall 0.193 0.017 11.130 0.000 0.443 0.443
##
## Variances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## .NIHTBX_PICVOCA 0.493 0.025 19.590 0.000 0.493 0.493
## .NIHTBX_READING 0.449 0.026 17.058 0.000 0.449 0.449
## .NIHTBX_FLANKER 0.479 0.040 12.110 0.000 0.479 0.479
## .NIHTBX_PATTERN 0.707 0.030 23.896 0.000 0.707 0.708
## .NIHTBX_PICTURE 0.637 0.026 24.382 0.000 0.637 0.637
## .PEA_RAVLT_LD_T 0.517 0.032 16.296 0.000 0.517 0.518
## Language 0.507 0.031 16.267 0.000 1.000 1.000
## CognitivFlxbty 0.520 0.047 11.106 0.000 1.000 1.000
## MemoryRecall 0.363 0.028 12.876 0.000 1.000 1.000
##
## R-Square:
## Estimate
## NIHTBX_PICVOCA 0.507
## NIHTBX_READING 0.551
## NIHTBX_FLANKER 0.521
## NIHTBX_PATTERN 0.292
## NIHTBX_PICTURE 0.363
## PEA_RAVLT_LD_T 0.482
Plabels = c("Vocab","Reading","Flanker","Pattern","Picture","RAVLT","Language","Mental\nFlexibity","Memory\nRecall")
semPlot::semPaths(object=NeuroCog1stOrder.Fit,
intercepts = F, residuals = F,
whatLabels="no",
what = "std",
layout="tree",
node.width = 1.4,
edge.label.cex = .6,
nodeLabels=Plabels,
edge.color="black",
sizeMan = 10, sizeLat = 10)
lavaan::summary(NeuroCog2ndOrder.Fit, standardized = TRUE, rsquare = TRUE, fit.measures = TRUE)
## lavaan 0.6-11 ended normally after 29 iterations
##
## Estimator ML
## Optimization method NLMINB
## Number of model parameters 15
##
## Number of observations 2902
##
## Model Test User Model:
## Standard Robust
## Test Statistic 20.462 20.029
## Degrees of freedom 6 6
## P-value (Chi-square) 0.002 0.003
## Scaling correction factor 1.022
## Yuan-Bentler correction (Mplus variant)
##
## Model Test Baseline Model:
##
## Test statistic 2949.557 2772.060
## Degrees of freedom 15 15
## P-value 0.000 0.000
## Scaling correction factor 1.064
##
## User Model versus Baseline Model:
##
## Comparative Fit Index (CFI) 0.995 0.995
## Tucker-Lewis Index (TLI) 0.988 0.987
##
## Robust Comparative Fit Index (CFI) 0.995
## Robust Tucker-Lewis Index (TLI) 0.988
##
## Loglikelihood and Information Criteria:
##
## Loglikelihood user model (H0) -23239.010 -23239.010
## Scaling correction factor 1.121
## for the MLR correction
## Loglikelihood unrestricted model (H1) -23228.779 -23228.779
## Scaling correction factor 1.093
## for the MLR correction
##
## Akaike (AIC) 46508.020 46508.020
## Bayesian (BIC) 46597.617 46597.617
## Sample-size adjusted Bayesian (BIC) 46549.957 46549.957
##
## Root Mean Square Error of Approximation:
##
## RMSEA 0.029 0.028
## 90 Percent confidence interval - lower 0.016 0.015
## 90 Percent confidence interval - upper 0.043 0.042
## P-value RMSEA <= 0.05 0.994 0.996
##
## Robust RMSEA 0.029
## 90 Percent confidence interval - lower 0.015
## 90 Percent confidence interval - upper 0.043
##
## Standardized Root Mean Square Residual:
##
## SRMR 0.014 0.014
##
## Parameter Estimates:
##
## Standard errors Sandwich
## Information bread Observed
## Observed information based on Hessian
##
## Latent Variables:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## Language =~
## NIHTBX_PICVOCA 1.000 0.712 0.712
## NIHTBX_READING 1.042 0.048 21.683 0.000 0.742 0.742
## CognitiveFlexibity =~
## NIHTBX_FLANKER 1.000 0.721 0.721
## NIHTBX_PATTERN 0.749 0.058 12.814 0.000 0.541 0.541
## MemoryRecall =~
## NIHTBX_PICTURE 1.000 0.603 0.603
## PEA_RAVLT_LD_T 1.152 0.068 16.872 0.000 0.694 0.695
## g =~
## Language 0.613 0.029 20.816 0.000 0.860 0.860
## CognitivFlxbty 0.434 0.025 17.213 0.000 0.602 0.602
## MemoryRecall 0.444 0.027 16.494 0.000 0.736 0.736
##
## Variances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## g 1.000 1.000 1.000
## .NIHTBX_PICVOCA 0.493 0.025 19.590 0.000 0.493 0.493
## .NIHTBX_READING 0.449 0.026 17.058 0.000 0.449 0.449
## .NIHTBX_FLANKER 0.479 0.040 12.110 0.000 0.479 0.479
## .NIHTBX_PATTERN 0.707 0.030 23.896 0.000 0.707 0.708
## .NIHTBX_PICTURE 0.637 0.026 24.382 0.000 0.637 0.637
## .PEA_RAVLT_LD_T 0.517 0.032 16.296 0.000 0.517 0.518
## .Language 0.132 0.029 4.505 0.000 0.260 0.260
## .CognitivFlxbty 0.332 0.039 8.535 0.000 0.637 0.637
## .MemoryRecall 0.166 0.020 8.361 0.000 0.458 0.458
##
## R-Square:
## Estimate
## NIHTBX_PICVOCA 0.507
## NIHTBX_READING 0.551
## NIHTBX_FLANKER 0.521
## NIHTBX_PATTERN 0.292
## NIHTBX_PICTURE 0.363
## PEA_RAVLT_LD_T 0.482
## Language 0.740
## CognitivFlxbty 0.363
## MemoryRecall 0.542
Plabels = c("Vocab","Reading","Flanker","Pattern","Picture","RAVLT","Language","Mental\nFlexibity","Memory\nRecall",expression(paste(italic("g"))))
semPlot::semPaths(object=NeuroCog2ndOrder.Fit,intercepts = F, residuals = F,
whatLabels="no", what = "std", layout="tree", node.width = 1.4,
edge.label.cex = .6, nodeLabels=Plabels, edge.color="black",
sizeMan = 10, sizeLat = 10)
semTools::reliabilityL2(NeuroCog2ndOrder.Fit,"g")
## omegaL1 omegaL2 partialOmegaL1
## 0.6103282 0.7792178 0.7282842
lavaan::summary(NeuroCogSingleFactor.Fit, standardized = TRUE, rsquare = TRUE, fit.measures = TRUE)
## lavaan 0.6-11 ended normally after 20 iterations
##
## Estimator ML
## Optimization method NLMINB
## Number of model parameters 12
##
## Number of observations 2902
##
## Model Test User Model:
## Standard Robust
## Test Statistic 513.874 510.350
## Degrees of freedom 9 9
## P-value (Chi-square) 0.000 0.000
## Scaling correction factor 1.007
## Yuan-Bentler correction (Mplus variant)
##
## Model Test Baseline Model:
##
## Test statistic 2949.557 2772.060
## Degrees of freedom 15 15
## P-value 0.000 0.000
## Scaling correction factor 1.064
##
## User Model versus Baseline Model:
##
## Comparative Fit Index (CFI) 0.828 0.818
## Tucker-Lewis Index (TLI) 0.713 0.697
##
## Robust Comparative Fit Index (CFI) 0.828
## Robust Tucker-Lewis Index (TLI) 0.713
##
## Loglikelihood and Information Criteria:
##
## Loglikelihood user model (H0) -23485.716 -23485.716
## Scaling correction factor 1.157
## for the MLR correction
## Loglikelihood unrestricted model (H1) -23228.779 -23228.779
## Scaling correction factor 1.093
## for the MLR correction
##
## Akaike (AIC) 46995.431 46995.431
## Bayesian (BIC) 47067.109 47067.109
## Sample-size adjusted Bayesian (BIC) 47028.981 47028.981
##
## Root Mean Square Error of Approximation:
##
## RMSEA 0.139 0.139
## 90 Percent confidence interval - lower 0.129 0.128
## 90 Percent confidence interval - upper 0.149 0.149
## P-value RMSEA <= 0.05 0.000 0.000
##
## Robust RMSEA 0.139
## 90 Percent confidence interval - lower 0.129
## 90 Percent confidence interval - upper 0.149
##
## Standardized Root Mean Square Residual:
##
## SRMR 0.070 0.070
##
## Parameter Estimates:
##
## Standard errors Sandwich
## Information bread Observed
## Observed information based on Hessian
##
## Latent Variables:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## g =~
## NIHTBX_PICVOCA 1.000 0.668 0.668
## NIHTBX_READING 1.028 0.035 29.109 0.000 0.687 0.687
## NIHTBX_FLANKER 0.670 0.045 14.852 0.000 0.447 0.447
## NIHTBX_PATTERN 0.537 0.042 12.642 0.000 0.358 0.358
## NIHTBX_PICTURE 0.706 0.046 15.506 0.000 0.472 0.472
## PEA_RAVLT_LD_T 0.787 0.045 17.566 0.000 0.526 0.526
##
## Variances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## .NIHTBX_PICVOCA 0.554 0.024 23.532 0.000 0.554 0.554
## .NIHTBX_READING 0.528 0.025 21.063 0.000 0.528 0.528
## .NIHTBX_FLANKER 0.800 0.026 30.533 0.000 0.800 0.800
## .NIHTBX_PATTERN 0.871 0.025 34.266 0.000 0.871 0.872
## .NIHTBX_PICTURE 0.777 0.024 32.574 0.000 0.777 0.777
## .PEA_RAVLT_LD_T 0.723 0.026 28.200 0.000 0.723 0.724
## g 0.446 0.029 15.272 0.000 1.000 1.000
##
## R-Square:
## Estimate
## NIHTBX_PICVOCA 0.446
## NIHTBX_READING 0.472
## NIHTBX_FLANKER 0.200
## NIHTBX_PATTERN 0.128
## NIHTBX_PICTURE 0.223
## PEA_RAVLT_LD_T 0.276
Plabels = c("Vocab","Reading","Flanker","Pattern","Picture","RAVLT", expression(paste(italic("g"))))
semPlot::semPaths(object=NeuroCogSingleFactor.Fit,
intercepts = F, residuals = F,
whatLabels="no",
what = "std",
layout="tree",
node.width = 1.4,
edge.label.cex = .6,
nodeLabels=Plabels,
edge.color="black",
sizeMan = 10, sizeLat = 10)
data_first_year <- bind_rows(data_train,data_valid,data_baseline)
NeuroCog2ndOrder <-'
Language =~ NIHTBX_PICVOCAB_UNCORRECTED + NIHTBX_READING_UNCORRECTED
CognitiveFlexibity =~ NIHTBX_FLANKER_UNCORRECTED + NIHTBX_PATTERN_UNCORRECTED
MemoryRecall =~ NIHTBX_PICTURE_UNCORRECTED + PEA_RAVLT_LD_TRIAL_VII_TC
g =~ NA*Language + CognitiveFlexibity + MemoryRecall #estimate the loading of GenAbi -> as opposed to using it as a marker
g ~~ 1*g #need to constrain variance to 1'
NeuroCog2ndOrder.Fit_First_year <- lavaan::cfa(model = NeuroCog2ndOrder,
data = data_first_year,
estimator="MLR")
CFA_NeuroCog2ndOrder_first_year <-
data_first_year %>% select(SUBJECTKEY) %>%
bind_cols(
lavaan::lavPredict(NeuroCog2ndOrder.Fit_First_year,
newdata =data_first_year) %>% as_tibble() %>%
rename(g_2ndOrder_first_year = g)) %>%
select(SUBJECTKEY, g_2ndOrder_first_year)
CFA_all_first_year <-
CFA_response$enet_tuning %>% bind_cols(fold = "first_layer") %>%
bind_rows(CFA_response$RanFor_tuning %>% bind_cols(fold = "second_layer") %>%
bind_rows(CFA_response$testing_baseline %>% bind_cols(fold = "baseline_test"))) %>%
left_join(CFA_NeuroCog2ndOrder_first_year, by = "SUBJECTKEY")
CFA_all_first_year %>%
group_by(fold) %>%
summarize (cor=cor (g_second_order, g_2ndOrder_first_year))
## # A tibble: 3 x 2
## fold cor
## <chr> <dbl>
## 1 baseline_test 0.997
## 2 first_layer 0.997
## 3 second_layer 0.997
fold.labs <- c("1st-layer train\nr = .9973686", "2nd-layer train\nr = .9973635", "Baseline Test\nr = .9970127")
names(fold.labs) <- c("first_layer", "second_layer","baseline_test" )
CFA_all_first_year %>%
mutate(fold=factor(fold, levels=c("first_layer", "second_layer","baseline_test"))) %>%
ggplot(aes(x=g_second_order, y=g_2ndOrder_first_year)) +
geom_point() +
geom_smooth(method=lm, color = "grey") +
facet_wrap(~fold, ncol = 3, nrow = 1,
labeller = labeller(fold = fold.labs)) +
labs(x = expression(paste("2nd-order ",italic("g"), "-Factor Based on 1st-Layer Train Data")),
y = ~ atop(paste("2nd-order ",italic("g"),"-Factor"), "Based on Full Baseline Data")) +
theme(legend.position = "none") +
#ggpubr::stat_cor(method = "pearson") + #label.x = -5, label.y = 30)
theme(text=element_text(size=19))
## Don't know how to automatically pick scale for object of type lavaan.matrix/matrix. Defaulting to continuous.
## `geom_smooth()` using formula 'y ~ x'
lowerFn <- function(data, mapping, continuous = "cor", ...) {
p <- ggplot(data = data, mapping = mapping) +
geom_point(colour = "black") +
geom_smooth(color = "grey", ...)
p
}
CFA_all_first_year_g <- CFA_all_first_year %>%
select(g_second_order,g_single_factor,g_EFA_CFA)
colnames(CFA_all_first_year_g) <-
make.names(c('2nd Order', 'Single Factor', 'EFA CFA'))
g_first_layer_plot <- CFA_all_first_year %>%
filter(fold == "first_layer") %>%
select(g_second_order,g_single_factor,g_EFA_CFA) %>%
GGally::ggpairs(.,
title = "1st-Layer Train Data",
columnLabels = gsub('X', '', gsub('.', '\n', colnames(CFA_all_first_year_g), fixed = T)),
lower = list(continuous = lowerFn))
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
g_second_layer_plot <- CFA_all_first_year %>%
filter(fold == "second_layer") %>%
select(g_second_order,g_single_factor,g_EFA_CFA) %>%
GGally::ggpairs(.,
title = "2nd-Layer Train Data",
columnLabels = gsub('X', '', gsub('.', '\n', colnames(CFA_all_first_year_g), fixed = T)),
lower = list(continuous = lowerFn))
g_baseline_test_plot <- CFA_all_first_year %>%
filter(fold == "baseline_test") %>%
select(g_second_order,g_single_factor,g_EFA_CFA) %>%
GGally::ggpairs(.,
title = "Baseline Test Data",
columnLabels = gsub('X', '', gsub('.', '\n', colnames(CFA_all_first_year_g), fixed = T)),
lower = list(continuous = lowerFn))
g_followup_test_plot <-
CFA_response$testing_followup %>%
select(g_second_order,g_single_factor,g_EFA_CFA) %>%
GGally::ggpairs(.,
title = "Follow-up Test Data",
columnLabels = gsub('X', '', gsub('.', '\n', colnames(CFA_all_first_year_g), fixed = T)),
lower = list(continuous = lowerFn))
G_plot_grid <- cowplot::plot_grid(
GGally::ggmatrix_gtable(g_first_layer_plot),
GGally::ggmatrix_gtable(g_second_layer_plot),
GGally::ggmatrix_gtable(g_baseline_test_plot),
GGally::ggmatrix_gtable(g_followup_test_plot),
# ggmatrix_gtable(g3),
nrow = 2
)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# now add the title
title <- ggdraw() +
draw_label(
expression(paste("Relationship among ",italic("g"), "-Factor Based on Different CFA models at Different Data Splits")),
fontface = 'bold',
x = 0,
hjust = 0
) +
theme(
# add margin on the left of the drawing canvas,
# so title is aligned with left edge of first plot
plot.margin = margin(0, 0, 0, 7)
)
plot_grid(
title, G_plot_grid,
ncol = 1,
# rel_heights values control vertical title margins
rel_heights = c(0.05, 1)
)
#CFA_Resp_Var <- names(CFA_response[["enet_tuning"]])[-1]%>% set_names()
## to focus on g_second_order, remove g_single_factora and g_EFA_CFA
CFA_Resp_Var <- names(CFA_response[["enet_tuning"]] %>%
select(-g_single_factor, -g_EFA_CFA))[-1]%>% set_names()
train_CFA <- left_join(CFA_response[["enet_tuning"]],
analysis( baseline_two_fold$splits[[1]]),
by = "SUBJECTKEY")
valid_CFA <- left_join(CFA_response[["RanFor_tuning"]],
assessment( baseline_two_fold$splits[[1]]),
by = "SUBJECTKEY")
baseline_CFA <- left_join(CFA_response[["testing_baseline"]],
baseline_test,
by = "SUBJECTKEY")
followup_CFA <- left_join(CFA_response[["testing_followup"]],
followup_analysis,
by = "SUBJECTKEY")
nrow(train_CFA)
## [1] 2906
nrow(valid_CFA)
## [1] 2917
nrow(baseline_CFA )
## [1] 5427
nrow(followup_CFA )
## [1] 5280
GDistData <- data.table::data.table(G_Factor = train_CFA$g_second_order, Split = "1st-Layer Train") %>%
rbind(data.table::data.table(G_Factor = valid_CFA$g_second_order, Split = "2nd-Layer Train")) %>%
rbind(data.table::data.table(G_Factor = baseline_CFA$g_second_order, Split = "Baseline Test")) %>%
rbind(data.table::data.table(G_Factor = followup_CFA$g_second_order, Split = "Follow-Up Test"))
ggplot(data = GDistData, aes(x = G_Factor, color = Split)) +
geom_density() +
labs(x= expression(paste("2nd-order ",italic("g")))) +
labs(y= "Density") +
theme(legend.title=element_blank()) +
theme(text = element_text(size=30)) +
theme(legend.position="top", legend.box="vertical", legend.margin=margin()) +
guides(color=guide_legend(nrow=2,byrow=TRUE))
CFA_var_plotting <- tibble("response" =CFA_Resp_Var,
"longer_name"=c("1st-Order Language",
"1st-Order Mental Flexibility",
"1st-Order Memory Recall",
"2nd-Order G-Factor"),
"short_name"=c("Language",
"Mental Flexibility",
"Memory Recall",
"G-Factor"))
Nback_enet_CFA <- CFA_Resp_Var %>%
map(.,~enet_processing_nocom(resp_var = .,
split_train = train_CFA,
split_valid = valid_CFA,
split_test = baseline_CFA,
followup_test = followup_CFA,
contrast_name = "X2backvs0back_ROI_"))
SST_enet_CFA <- CFA_Resp_Var %>%
map(.,~enet_processing_nocom(resp_var = .,
split_train = train_CFA,
split_valid = valid_CFA,
split_test = baseline_CFA,
followup_test = followup_CFA ,
contrast_name = "anystopvscorrectgo_ROI_"))
MID_enet_CFA <- CFA_Resp_Var %>%
map(.,~enet_processing_nocom(resp_var = .,
split_train = train_CFA,
split_valid = valid_CFA,
split_test = baseline_CFA,
followup_test = followup_CFA,
contrast_name = "antiRewVsNeu_ROI_"))
DTI_enet_CFA <- CFA_Resp_Var %>%
map(.,~enet_processing_comOne(resp_var = .,
split_train = train_CFA,
split_valid = valid_CFA,
split_test = baseline_CFA,
followup_test = followup_CFA,
measure_one = "FA_"))
smri_enet_CFA <- CFA_Resp_Var %>%
map(.,~enet_processing_com(resp_var = .,
split_train = train_CFA,
split_valid = valid_CFA,
split_test = baseline_CFA,
followup_test = followup_CFA,
measure_one = "ASEG_",
measure_two = "Dest_Thick_"))
rsmri_enet_CFA <- CFA_Resp_Var %>%
map(.,~enet_processing_com(resp_var = .,
split_train = train_CFA,
split_valid = valid_CFA,
split_test = baseline_CFA,
followup_test = followup_CFA,
measure_one = "par",
measure_two = "Within"))
enet_CFA_data <- list("Nback"=Nback_enet_CFA,
"SST"=SST_enet_CFA,
"MID"=MID_enet_CFA,
"rsmri"=rsmri_enet_CFA,
"smri"=smri_enet_CFA,
"DTI"=DTI_enet_CFA)
##keep from using all the cores
##keep your computer safe
no_cores <- availableCores() - 4
plan(multisession,workers= no_cores)
Nback_CFA <- CFA_Resp_Var %>%
future_map(.,~enet_tuning(resp_var =.,
data_list = Nback_enet_CFA[[.]]),
.options = furrr_options(seed=TRUE))
SST_CFA <- CFA_Resp_Var %>%
future_map(.,~enet_tuning(resp_var =.,
data_list = SST_enet_CFA[[.]]),
.options = furrr_options(seed=TRUE))
MID_CFA <- CFA_Resp_Var %>%
future_map(.,~enet_tuning(resp_var =.,
data_list = MID_enet_CFA[[.]]),
.options = furrr_options(seed=TRUE))
DTI_CFA <- CFA_Resp_Var %>%
future_map(.,~enet_tuning(resp_var =.,
data_list = DTI_enet_CFA[[.]]),
.options = furrr_options(seed=TRUE))
smri_CFA <- CFA_Resp_Var %>%
future_map(.,~enet_tuning(resp_var =.,
data_list = smri_enet_CFA[[.]]),
.options = furrr_options(seed=TRUE))
rsmri_CFA <- CFA_Resp_Var %>%
future_map(.,~enet_tuning(resp_var =.,
data_list = rsmri_enet_CFA[[.]]),
.options = furrr_options(seed=TRUE))
enet_CFA <- list("Nback_results"=Nback_CFA,
"SST_results"=SST_CFA,
"MID_results"=MID_CFA,
"rsmri_results"=rsmri_CFA,
"smri_results"=smri_CFA,
"DTI_results"=DTI_CFA)
datasplits_number(data_list = enet_CFA,resp_vec = CFA_Resp_Var, resp_plotting = CFA_var_plotting)
[[1]]
modality | enet_train | stack_train | baseline | followup | short_name |
---|---|---|---|---|---|
Nback | 1369 | 1331 | 2653 | 2996 | Language |
Nback | 1369 | 1331 | 2653 | 2996 | Mental Flexibility |
Nback | 1369 | 1331 | 2653 | 2996 | Memory Recall |
Nback | 1369 | 1331 | 2653 | 2996 | G-Factor |
modality | enet_train | stack_train | baseline | followup | short_name |
---|---|---|---|---|---|
SST | 1412 | 1421 | 2791 | 2903 | Language |
SST | 1412 | 1421 | 2791 | 2903 | Mental Flexibility |
SST | 1412 | 1421 | 2791 | 2903 | Memory Recall |
SST | 1412 | 1421 | 2791 | 2903 | G-Factor |
modality | enet_train | stack_train | baseline | followup | short_name |
---|---|---|---|---|---|
MID | 1611 | 1591 | 3138 | 3127 | Language |
MID | 1611 | 1591 | 3138 | 3127 | Mental Flexibility |
MID | 1611 | 1591 | 3138 | 3127 | Memory Recall |
MID | 1611 | 1591 | 3138 | 3127 | G-Factor |
modality | enet_train | stack_train | baseline | followup | short_name |
---|---|---|---|---|---|
rsmri | 1988 | 2007 | 3779 | 4137 | Language |
rsmri | 1988 | 2007 | 3779 | 4137 | Mental Flexibility |
rsmri | 1988 | 2007 | 3779 | 4137 | Memory Recall |
rsmri | 1988 | 2007 | 3779 | 4137 | G-Factor |
modality | enet_train | stack_train | baseline | followup | short_name |
---|---|---|---|---|---|
smri | 2395 | 2428 | 4507 | 4569 | Language |
smri | 2395 | 2428 | 4507 | 4569 | Mental Flexibility |
smri | 2395 | 2428 | 4507 | 4569 | Memory Recall |
smri | 2395 | 2428 | 4507 | 4569 | G-Factor |
modality | enet_train | stack_train | baseline | followup | short_name |
---|---|---|---|---|---|
DTI | 2268 | 2260 | 4071 | 4230 | Language |
DTI | 2268 | 2260 | 4071 | 4230 | Mental Flexibility |
DTI | 2268 | 2260 | 4071 | 4230 | Memory Recall |
DTI | 2268 | 2260 | 4071 | 4230 | G-Factor |
modality_enet_CFA <-names(enet_CFA)
param_tune_enet <- modality_enet_CFA %>%
map(.,function(modality_input=.,list_input=enet_CFA){
tuned_results <- list_input[[modality_input]][["g_second_order"]][["param_tune"]]%>%
mutate(modality=modality_input)
return(tuned_results)
})%>% do.call(rbind,.)%>%
select("penalty", "mixture","modality")
param_tune_enet %>%
kableExtra::kbl(caption = "Tuned parameters for elastic net") %>%
kableExtra::kable_classic(full_width = F, html_font = "Cambria")
penalty | mixture | modality |
---|---|---|
1.1489510 | 0.0 | Nback_results |
0.0195640 | 1.0 | SST_results |
0.0252354 | 1.0 | MID_results |
0.6080224 | 0.0 | rsmri_results |
0.1162322 | 0.1 | smri_results |
0.1162322 | 0.0 | DTI_results |
# mean of penalty term 0.6190428
param_tune_enet %>% summarise_at("penalty", mean)
## # A tibble: 1 x 1
## penalty
## <dbl>
## 1 0.339
# sd of penalty term 0.4316607
param_tune_enet %>% summarise_at("penalty", sd)
## # A tibble: 1 x 1
## penalty
## <dbl>
## 1 0.453
enetXplorer allows us to use permutation to test the significance of each feature
enet_CFA_xplorer <- enet_CFA
names(enet_CFA_xplorer) <- modality_vec
##keep from using all the cores
##keep your computer safe
no_cores <- availableCores() - 4
plan(multisession,workers= no_cores)
enet_CFA_fit <- modality_vec %>%
map(.,~fitting_enetxplorer(resp_vec = CFA_Resp_Var,
modality_var = .,
data_input = enet_CFA_data,
tuning_input = enet_CFA_xplorer))
names(enet_CFA_fit)<- modality_vec
modality_vec %>% map(.,~enet_fit_table(modality_input=.,
fit_input = enet_CFA_fit,
tune_input = enet_CFA_xplorer,
resp_input=CFA_Resp_Var,
plotting_names = CFA_var_plotting))
[[1]]
Alpha | Best-tune lambda | Accuracy (Pearson r) | P-value | respones |
---|---|---|---|---|
0 | 1.1486 | 0.4058 | 0.0004 | Language |
0 | 2.0159 | 0.2895 | 0.0004 | Mental Flexibility |
0 | 2.1021 | 0.3251 | 0.0004 | Memory Recall |
0 | 1.2884 | 0.4097 | 0.0004 | G-Factor |
Alpha | Best-tune lambda | Accuracy (Pearson r) | P-value | respones |
---|---|---|---|---|
1.0 | 0.0076 | 0.1731 | 0.0004 | Language |
1.0 | 0.0211 | 0.1447 | 0.0004 | Mental Flexibility |
0.1 | 0.0087 | 0.1218 | 0.0012 | Memory Recall |
1.0 | 0.0071 | 0.1784 | 0.0004 | G-Factor |
Alpha | Best-tune lambda | Accuracy (Pearson r) | P-value | respones |
---|---|---|---|---|
1 | 0.0284 | 0.2756 | 0.0004 | Language |
0 | 2.2486 | 0.1614 | 0.0004 | Mental Flexibility |
0 | 1.5746 | 0.2128 | 0.0004 | Memory Recall |
1 | 0.0236 | 0.2649 | 0.0004 | G-Factor |
Alpha | Best-tune lambda | Accuracy (Pearson r) | P-value | respones |
---|---|---|---|---|
0 | 0.3352 | 0.2410 | 0.0004 | Language |
0 | 1.2950 | 0.1434 | 0.0004 | Mental Flexibility |
0 | 1.2108 | 0.1681 | 0.0004 | Memory Recall |
0 | 0.4319 | 0.2281 | 0.0004 | G-Factor |
Alpha | Best-tune lambda | Accuracy (Pearson r) | P-value | respones |
---|---|---|---|---|
0.1 | 0.0913 | 0.2483 | 0.0004 | Language |
0.0 | 2.6983 | 0.1540 | 0.0004 | Mental Flexibility |
0.0 | 2.0348 | 0.1881 | 0.0004 | Memory Recall |
0.1 | 0.1134 | 0.2387 | 0.0004 | G-Factor |
Alpha | Best-tune lambda | Accuracy (Pearson r) | P-value | respones |
---|---|---|---|---|
0 | 0.0128 | 0.2370 | 0.0004 | Language |
1 | 0.0002 | 0.1776 | 0.0004 | Mental Flexibility |
0 | 0.0394 | 0.1758 | 0.0004 | Memory Recall |
0 | 0.0134 | 0.2292 | 0.0004 | G-Factor |
map2(.x=modality_type_vec,
.y=str_type_vec,
~enet_interval_plot(data_input=enet_CFA_fit,
tuning_input=enet_CFA_xplorer,
resp_input =CFA_Resp_Var,
modality_input = .x,
str_input = .y,
plotting_names = CFA_var_plotting))
## [[1]]
## [[1]]$Lang_first_order
##
## [[1]]$CogFlex_first_order
##
## [[1]]$MemRe_first_order
##
## [[1]]$g_second_order
##
##
## [[2]]
## [[2]]$Lang_first_order
##
## [[2]]$CogFlex_first_order
##
## [[2]]$MemRe_first_order
##
## [[2]]$g_second_order
##
##
## [[3]]
## [[3]]$Lang_first_order
##
## [[3]]$CogFlex_first_order
##
## [[3]]$MemRe_first_order
##
## [[3]]$g_second_order
enet_interval_plot_smri(data_input=enet_CFA_fit,tuning_input=enet_CFA,resp_input = CFA_Resp_Var, plotting_names = CFA_var_plotting)
## $Lang_first_order
##
## $CogFlex_first_order
##
## $MemRe_first_order
##
## $g_second_order
### Plot results from eNetExplorer with rs-fMRI predicting each factor score
enet_interval_plot_rsmri(data_input=enet_CFA_fit,tuning_input=enet_CFA,resp_input = CFA_Resp_Var, plotting_names = CFA_var_plotting)
## $Lang_first_order
##
## $CogFlex_first_order
##
## $MemRe_first_order
##
## $g_second_order
enet_interval_plot_DTI(data_input=enet_CFA_fit,tuning_input=enet_CFA,resp_input = CFA_Resp_Var, plotting_names = CFA_var_plotting)
## $Lang_first_order
##
## $CogFlex_first_order
##
## $MemRe_first_order
##
## $g_second_order
library(ggseg)
library(ggsegExtra)
## Loading required package: ggseg3d
## Registered S3 method overwritten by 'geomorph':
## method from
## print.pls parsnip
library(ggsegDesterieux)
modality_type_vec <- list("Nback","SST","MID","smri")
# modality_word_vec <- list("NBack-fMRI", "SST-fMRI","MID-fMRI","sMRI")
modality_type_word_vec <- list(c("Nback","NBack-fMRI"),
c("SST","SST-fMRI"),
c("MID","MID-fMRI"),
c("smri","sMRI"))
str_type_vec <- list("X2backvs0back_ROI_",
"anystopvscorrectgo_ROI_",
"antiRewVsNeu_ROI_",
c("ASEG_","Dest_Thick_"))
#CFA_Resp_vec <- c("Lang_first_order", "CogFlex_first_order", "MemRe_first_order", "g_second_order")
brainPrepTibFunc <- function(enet_CFA_fit,
modality_type,
str_type,
CFA_Resp) {
if (modality_type[1] == "smri") {
brainPlotTib <- extract_tibble(enet_CFA_fit[[modality_type[1]]][[CFA_Resp]],
alpha_index = paste0("a",
enet_CFA_xplorer[[modality_type[1]]][[CFA_Resp]][["param_tune"]][["mixture"]])) %>%
filter(type == "Target") %>%
mutate(.,label = str_replace_all(variable, str_type[1], '')) %>%
mutate(.,label = str_replace_all(label, str_type[2], '')) %>%
mutate(.,label = str_replace_all(label, '\\.', '-')) %>%
mutate(.,label = str_replace_all(label, 'Brain-Stem', 'brain-stem')) %>%
mutate(.,label = str_replace_all(label, 'Right-Cerebellum-Cortex', 'right-cerebellum-cortex'))
}
else{
brainPlotTib <- extract_tibble(enet_CFA_fit[[modality_type[1]]][[CFA_Resp]],
alpha_index = paste0("a",
enet_CFA_xplorer[[modality_type[1]]][[CFA_Resp]][["param_tune"]][["mixture"]])) %>%
filter(type == "Target") %>%
mutate(.,label = str_replace_all(variable, str_type, '')) %>%
mutate(.,label = str_replace_all(label, '\\.', '-')) %>%
mutate(.,label = str_replace_all(label, 'Brain-Stem', 'brain-stem')) %>%
mutate(.,label = str_replace_all(label, 'Right-Cerebellum-Cortex', 'right-cerebellum-cortex'))
}
coefs_ggsegDes_all <-
ggsegDesterieux::desterieux %>% as_tibble %>%
select(label) %>%
na.omit() %>%
left_join(select(brainPlotTib,
label,
wmean,
pvalue,
variable), by = "label")
coefs_ggsegAseg_all <-
ggseg::aseg$data %>% as_tibble %>%
select(label) %>%
na.omit() %>%
left_join(select(brainPlotTib,
label,
wmean,
pvalue,
variable), by = "label")
coefs_ggsegMax <- max(max(coefs_ggsegDes_all$wmean, na.rm = TRUE),
max(coefs_ggsegAseg_all$wmean, na.rm = TRUE))
coefs_ggsegMin <- min(min(coefs_ggsegDes_all$wmean, na.rm = TRUE),
min(coefs_ggsegAseg_all$wmean, na.rm = TRUE))
plot_ggsegDes_all <-
ggsegDesterieux::desterieux %>% as_tibble %>%
right_join(coefs_ggsegDes_all, by ="label") %>%
select(label, wmean, pvalue) %>% filter(pvalue <.05) %>%
ggseg(atlas = 'desterieux',
mapping = aes(fill = wmean),
colour="black"
) +
theme_void() +
scale_fill_gradient2(
limits = c(coefs_ggsegMin, coefs_ggsegMax),
midpoint = 0, low = "blue", mid = "white",
high = "red", space = "Lab", na.value="transparent" ) +
theme(legend.position = "none")+
labs(title =paste0(modality_type[2]))
plot_ggsegAseg_all <-
ggseg::aseg$data %>% as_tibble %>%
right_join(coefs_ggsegAseg_all, by ="label") %>%
select(label, wmean, pvalue) %>% filter(pvalue <.05) %>%
ggseg(atlas="aseg",
mapping=aes(fill=wmean),
view = "axial",
colour="black") +
theme_void() +
scale_fill_gradient2(
limits = c(coefs_ggsegMin, coefs_ggsegMax),
midpoint = 0, low = "blue", mid = "white",
high = "red", space = "Lab", na.value="transparent" ) +
guides(fill = guide_colourbar(barwidth = 0.5, barheight = 3, title = NULL))
theme(legend.position = "none")
plotPreptaskfMRI <-
list(brainPlotTib,
coefs_ggsegDes_all,
coefs_ggsegAseg_all,
coefs_ggsegMax,
coefs_ggsegMin,
plot_ggsegDes_all,
plot_ggsegAseg_all) %>%
purrr::set_names("brainPlotTib","coefs_ggsegDes_all", "coefs_ggsegAseg_all",
"coefs_ggsegMax","coefs_ggsegMin","plot_ggsegDes_all", "plot_ggsegAseg_all")
return(plotPreptaskfMRI)
}
plotPreptaskfMRI <-
modality_type_word_vec %>%
purrr::set_names(purrr::map(.,1)) %>%
purrr::map2(.x = .,
.y = str_type_vec,
~ brainPrepTibFunc(enet_CFA_fit = enet_CFA_fit,
modality_type = .x,
str_type = .y,
CFA_Resp = "g_second_order"))
## merging atlas and data by 'label'
## merging atlas and data by 'label'
## Warning: Some data not merged properly. Check for naming errors in data:
## atlas type hemi side region label wmean pvalue
## <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 <NA> <NA> <NA> <NA> <NA> right-cerebellum-cortex 0.0301 0.00120
## merging atlas and data by 'label'
## merging atlas and data by 'label'
## merging atlas and data by 'label'
## merging atlas and data by 'label'
## merging atlas and data by 'label'
## merging atlas and data by 'label'
ggpubr::ggarrange(plotlist=purrr::map(plotPreptaskfMRI,"plot_ggsegDes_all")
,nrow=4)
# ggpubr::ggarrange(plotlist=purrr::map(plotPreptaskfMRI,"plot_ggsegAs_all")
# ,nrow=4)
#
modality_type_vec %>% map(.,~gridExtra::grid.arrange(plotPreptaskfMRI[[.]][["plot_ggsegDes_all"]],
plotPreptaskfMRI[[.]][["plot_ggsegAseg_all"]],
nrow = 1, ncol = 2, widths = c(4, 1.5)))
## [[1]]
## TableGrob (1 x 2) "arrange": 2 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
##
## [[2]]
## TableGrob (1 x 2) "arrange": 2 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
##
## [[3]]
## TableGrob (1 x 2) "arrange": 2 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
##
## [[4]]
## TableGrob (1 x 2) "arrange": 2 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
library(ggsegJHU)
HaglerToJhu <- read_csv(paste0(utilFold, "HaglerToJhu.csv"))
## Rows: 44 Columns: 5
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (5): hemi, region, side, label, Hagler
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
DTITrib <- extract_tibble(enet_CFA_fit[["DTI"]][["g_second_order"]],
alpha_index = paste0("a", enet_CFA_xplorer[["DTI"]][["g_second_order"]][["param_tune"]][["mixture"]])) %>%
filter(type == "Target") %>%
mutate(.,Hagler = str_replace_all(variable, "FA_", ''))
ggsegJHU::jhu %>% as_tibble %>%
left_join(HaglerToJhu, by = c("hemi","region","side","label")) %>%
left_join(DTITrib %>% select(-type), by = "Hagler") %>%
filter(pvalue <.05) %>%
select( -geometry) %>%
ggseg(atlas = 'jhu',
mapping = aes(fill = wmean),
colour="black") +
theme_void() +
theme(text = element_text(size=30)) +
labs(title = "DTI") +
scale_fill_gradient2(
# limits = c(coefs_ggsegMin, coefs_ggsegMax),
midpoint = 0, low = "blue", mid = "white",
high = "red", space = "Lab", na.value="transparent" ) +
guides(fill = guide_colourbar(barwidth = 1.5, barheight = 9, title = NULL))
## merging atlas and data by 'atlas', 'type', 'hemi', 'region', 'side', 'label'
theme(legend.position = "none")
## List of 1
## $ legend.position: chr "none"
## - attr(*, "class")= chr [1:2] "theme" "gg"
## - attr(*, "complete")= logi FALSE
## - attr(*, "validate")= logi TRUE
enet_rf_valid_CFA <- CFA_Resp_Var %>%
map(.,~rnd_for_data_prep(pred_select = "predicted",
data_list = enet_CFA,
resp_chose = .,
resp_data = valid_CFA))
enet_rf_test_baseline_CFA <- CFA_Resp_Var %>%
map(.,~rnd_for_data_prep(pred_select="predicted_baseline",
data_list = enet_CFA,
resp_chose = .,
resp_data = baseline_CFA))
enet_rf_test_followup_CFA <- CFA_Resp_Var %>%
map(.,~rnd_for_data_prep(pred_select="predicted_followup",
data_list = enet_CFA,
resp_chose = .,
resp_data = followup_CFA))
CFA_stack_valid <- CFA_Resp_Var %>%
map(.,~stack_datasize_extract(resp_input = .,data_input = enet_rf_valid_CFA))%>%
do.call(rbind,.)%>%
mutate(number_valid = datasplit_number)%>%
subset(.,select=-c(datasplit_number))
CFA_stack_baseline <- CFA_Resp_Var %>%
map(.,~stack_datasize_extract(resp_input = .,data_input = enet_rf_test_baseline_CFA))%>%
do.call(rbind,.)%>%
mutate(baseline_test = datasplit_number)%>%
subset(.,select=-c(datasplit_number))
CFA_stack_followup <- CFA_Resp_Var %>%
map(.,~stack_datasize_extract(resp_input = .,data_input = enet_rf_test_followup_CFA))%>%
do.call(rbind,.)%>%
mutate(followup_test = datasplit_number)%>%
subset(.,select=-c(datasplit_number))
plyr::join_all(list(CFA_stack_valid,CFA_stack_baseline,CFA_stack_followup,CFA_var_plotting),
type="left", by = "response")%>%
select("short_name","number_valid",ends_with("test"))%>%
kableExtra::kbl(caption = paste0("Sample size of all the data sets in stacked models ")) %>%
kableExtra::kable_classic(full_width = F, html_font = "Cambria")
short_name | number_valid | baseline_test | followup_test |
---|---|---|---|
Language | 2740 | 5108 | 5119 |
Mental Flexibility | 2740 | 5108 | 5119 |
Memory Recall | 2740 | 5108 | 5119 |
G-Factor | 2740 | 5108 | 5119 |
##keep from using all the cores
##keep your computer safe
no_cores <- availableCores() - 4
plan(multisession,workers= no_cores)
rf_fit_CFA <- CFA_Resp_Var %>%
future_map(.,~random_forest_tuning(resp=.,
valid_list = enet_rf_valid_CFA,
baseline_test = enet_rf_test_baseline_CFA,
followup_test = enet_rf_test_followup_CFA),
.options = furrr_options(seed=TRUE),
.progress = TRUE)
CFA_rf_results <- random_fores_output_print(rf_output = rf_fit_CFA,
enet_output = enet_CFA,
response_vec=CFA_Resp_Var,
names_plotting = CFA_var_plotting)
CFA_rf_results[["rf_performance"]]%>%
rename("Response Variable" = respones,
"Baseline" = "Baseline (Pearson r)",
"Followup" = "Followup (Pearson r)") %>%
select("Response Variable",
"Baseline","Followup",
"#predictors/split",
"#observations/split") %>%
mutate(across(2:3, round, 3)) %>%
kableExtra::kbl(caption = paste0("Performance (Pearson r) and Hyperparameters of Random Forest Stacking for Factor Scores")) %>%
kableExtra::kable_classic(full_width = F, html_font = "Cambria")
Response Variable | Baseline | Followup | #predictors/split | #observations/split |
---|---|---|---|---|
Language | 0.423 | 0.418 | 4 | 160 |
Mental Flexibility | 0.314 | 0.305 | 8 | 350 |
Memory Recall | 0.369 | 0.348 | 4 | 200 |
G-Factor | 0.434 | 0.414 | 4 | 170 |
CFA_rf_results[["rf_enet_baseline"]] %>%
rename("Response Variable" = short_name,
"NBack-fMRI" = Nback,
"SST-fMRI" = SST,
"MID-fMRI" = MID,
"rs-fMRI" = rsmri,
"sMRI" = smri,
"Stacked" = random_forest_baseline) %>%
select("Response Variable",
"Stacked",
"NBack-fMRI",
"MID-fMRI",
"SST-fMRI",
"rs-fMRI",
"sMRI",
"DTI") %>%
mutate(across(2:8, round, 3)) %>%
kableExtra::kbl(caption = paste0("Performance of Random Forest Stacking VS Elastic Net on Each Modality for Each Factor Score\n at the Baseline(Pearson r)")) %>%
kableExtra::kable_classic(full_width = F, html_font = "Cambria")
Response Variable | Stacked | NBack-fMRI | MID-fMRI | SST-fMRI | rs-fMRI | sMRI | DTI |
---|---|---|---|---|---|---|---|
Language | 0.423 | 0.388 | 0.197 | 0.140 | 0.228 | 0.211 | 0.192 |
Mental Flexibility | 0.314 | 0.313 | 0.144 | 0.063 | 0.133 | 0.132 | 0.144 |
Memory Recall | 0.369 | 0.315 | 0.169 | 0.082 | 0.182 | 0.179 | 0.166 |
G-Factor | 0.434 | 0.402 | 0.202 | 0.129 | 0.224 | 0.210 | 0.193 |
CFA_rf_results[["rf_enet_followup"]] %>%
rename("Response Variable" = short_name,
"NBack-fMRI" = Nback,
"SST-fMRI" = SST,
"MID-fMRI" = MID,
"rs-fMRI" = rsmri,
"sMRI" = smri,
"Stacked" = random_forest_followup) %>%
select("Response Variable",
"Stacked",
"NBack-fMRI",
"MID-fMRI",
"SST-fMRI",
"rs-fMRI",
"sMRI",
"DTI") %>%
mutate(across(2:8, round, 3)) %>%
kableExtra::kbl(caption = paste0("Performance of Random Forest Stacking VS Elastic Net on Each Modality for Each Factor Score\n at the Follow-Up (Pearson r)")) %>%
kableExtra::kable_classic(full_width = F, html_font = "Cambria")
Response Variable | Stacked | NBack-fMRI | MID-fMRI | SST-fMRI | rs-fMRI | sMRI | DTI |
---|---|---|---|---|---|---|---|
Language | 0.418 | 0.370 | 0.149 | 0.150 | 0.254 | 0.236 | 0.211 |
Mental Flexibility | 0.305 | 0.292 | 0.090 | 0.074 | 0.166 | 0.156 | 0.173 |
Memory Recall | 0.348 | 0.319 | 0.123 | 0.130 | 0.206 | 0.189 | 0.188 |
G-Factor | 0.414 | 0.383 | 0.148 | 0.145 | 0.253 | 0.230 | 0.217 |
CFA_Resp_Var%>% map(.,~rf_fit_CFA[[.]][["model_fit"]] %>%
vip(geom = "point"))
## $Lang_first_order
##
## $CogFlex_first_order
##
## $MemRe_first_order
##
## $g_second_order
enet_rf_valid_CFA_permu <- enet_rf_valid_CFA %>% map(.,function(data_input=.){
data_output<- data_input[["NoID"]] %>%
dplyr::rename("MID_large"="MID_pred.x",
"SST_large"="SST_pred.x",
"smri_large"="smri_pred.x",
"rsmri_large"="rsmri_pred.x",
"DTI_large"="DTI_pred.x",
"Nback_large"="Nback_pred.x",
"MID_small"="MID_pred.y",
"SST_small"="SST_pred.y",
"smri_small"="smri_pred.y",
"rsmri_small"="rsmri_pred.y",
"DTI_small"="DTI_pred.y",
"Nback_small"="Nback_pred.y")
return(data_output)
})
cf_fit_CFA <- CFA_Resp_Var %>% map(.,function(resp_var=.){
cforest(as.formula(paste0(resp_var, "~ .")),
data=enet_rf_valid_CFA_permu[[resp_var]],
control= cforest_unbiased(mtry=rf_fit_CFA[[resp_var]][["param_tune"]]$mtry,
ntree=1000,
minsplit=rf_fit_CFA[[resp_var]][["param_tune"]]$min_n))
})
PI_permimp_CFA <-cf_fit_CFA %>%
map(.,~permimp(.,progressBar = FALSE))
CFA_Resp_Var %>%
map(.,~plot(PI_permimp_CFA[[.]], type = "box",
horizontal = TRUE,
main=paste0("Conditional Permutation Importance ",
CFA_var_plotting$short_name[[which(CFA_var_plotting$response==.)]])))
## $Lang_first_order
## NULL
##
## $CogFlex_first_order
## NULL
##
## $MemRe_first_order
## NULL
##
## $g_second_order
## NULL
CFA_Resp_Var %>%
map(.,~plot(PI_permimp_CFA[[.]],
type = "bar",
interval="quantile",
main=paste0("Conditional Permutation Importance ",
CFA_var_plotting$short_name[[which(CFA_var_plotting$response==.)]])))
## $Lang_first_order
## NULL
##
## $CogFlex_first_order
## NULL
##
## $MemRe_first_order
## NULL
##
## $g_second_order
## NULL
library("fastshap")
##
## Attaching package: 'fastshap'
## The following object is masked from 'package:vip':
##
## gen_friedman
## The following object is masked from 'package:dplyr':
##
## explain
##need to test this works for all the models
model_pred_fun <- function(object, newdata) {
pred_results <- predict(object, new_data = newdata)
return(pred_results$.pred)
}
model_shapley <- function(fit_input,data_input,resp_input){
library(doFuture)
registerDoFuture()
plan(multisession(workers = 25))
## because of the false alarm in plyr running this chunk would give
doRNG::registerDoRNG()
model_shap <- fit_input %>%
fastshap::explain(X = data_input%>%
select(-resp_input)
%>% as.data.frame(),nsim= 1000, pred_wrapper =model_pred_fun
,.parallel=TRUE
)
return(model_shap)
}
gfactor_random_forest_train_data <- enet_rf_valid_CFA$g_second_order$NoID
gfactor_random_forest_train_ID <- enet_rf_valid_CFA$g_second_order$ID$SUBJECTKEY
gfactor_random_forest_fit <- rf_fit_CFA$g_second_order$model_fit
shapley_gractor <- model_shapley(fit_input= gfactor_random_forest_fit,
data_input = gfactor_random_forest_train_data,
resp_input = CFA_Resp_Var[4])
# The input of this function should have exactly 4 variables
# SUBJECTKEY that is subject ID
# The names of modality ie MID, smri, rsmri, nback, sst and dti
# shapley values
# feature values
shapley_summary_plot <- function(data_input, list_val_input){
shap_plot <- data_input%>%
# filter(var_names %in% vars_keep)%>%
mutate(modality_names = fct_reorder(modality_names, shapley_values,.fun = "max"))%>%
ggplot(aes(x = modality_names, y = shapley_values, color = feature_values)) +
coord_flip(ylim = range(data_input$shapley_values)*1.1) +
geom_hline(yintercept = 0) + # the y-axis beneath
# sina plot:
ggforce::geom_sina(method = "counts", maxwidth = 0.7, alpha = 0.7)+
scale_color_gradient(low="blue", high="red",
breaks=unlist(list_val_input), labels=c('min','med','max'),
guide = guide_colorbar(direction = "horizontal",barwidth = 12, barheight = 0.3))+
theme_bw() +
theme(axis.line.y = element_blank(),
axis.ticks.y = element_blank(), # remove axis line
legend.position="bottom",
legend.direction = 'horizontal',
legend.title=element_text(size=20),
legend.text=element_text(size=20),
legend.box = "horizontal",
axis.title.x= element_text(size = 20),
axis.text.y = element_text(size = 20),
axis.text.x = element_text(size = 20)) +
labs(y = "SHAP value (impact on model output)", x = "", color = "Feature value ")
return(shap_plot)
}
shapley_plot_list <- function(shapley_input, data_input, ID_input){
##processing shapley output
shapley_x_wider <- shapley_input%>%tibble::as_tibble()%>%
dplyr::select(ends_with(".x"))%>%
rename(MID = MID_pred.x,rs_fMRI= rsmri_pred.x, SST= SST_pred.x,
sMRI= smri_pred.x, NBack = Nback_pred.x, DTI = DTI_pred.x)%>%
mutate(SUBJECTKEY=ID_input)
shapley_x<- shapley_x_wider%>%
pivot_longer(-SUBJECTKEY, names_to = "modality_names", values_to = "shapley_values")
data_x <- data_input%>%tibble::as_tibble()%>%
dplyr::select(ends_with(".x"))%>%
rename(MID = MID_pred.x,rs_fMRI= rsmri_pred.x, SST= SST_pred.x,
sMRI= smri_pred.x, NBack = Nback_pred.x, DTI = DTI_pred.x)%>%
mutate(SUBJECTKEY=ID_input)%>%
pivot_longer(-SUBJECTKEY, names_to = "modality_names", values_to = "feature_values")
shapley_y_wider <- shapley_input%>%tibble::as_tibble()%>%
dplyr::select(ends_with(".y"))%>%
rename(MID = MID_pred.y,rs_fMRI= rsmri_pred.y, SST= SST_pred.y,
sMRI= smri_pred.y, NBack = Nback_pred.y, DTI = DTI_pred.y)%>%
mutate(SUBJECTKEY=ID_input)
shapley_y <- shapley_y_wider%>%
pivot_longer(-SUBJECTKEY, names_to = "modality_names", values_to = "shapley_values")
data_y <- data_input%>%tibble::as_tibble()%>%
dplyr::select(ends_with(".y"))%>%
rename(MID = MID_pred.y,rs_fMRI= rsmri_pred.y, SST= SST_pred.y,
sMRI= smri_pred.y, NBack = Nback_pred.y, DTI = DTI_pred.y)%>%
mutate(SUBJECTKEY=ID_input)%>%
pivot_longer(-SUBJECTKEY, names_to = "modality_names", values_to = "feature_values")
shapley_all <- bind_rows(shapley_x,shapley_y)
data_all <- bind_rows(data_x,data_y)
shapley_summary_tibble <- plyr::join_all(list(shapley_all,data_all),
by=c("SUBJECTKEY","modality_names"),type="full")
##shapley summary plot
shapley_summary_tibble_NA <- shapley_summary_tibble %>%
naniar::replace_with_na(replace = list(feature_values = c(-1000,1000.000000000)))
shapley_summary_tibble_0 <- shapley_summary_tibble
shapley_summary_tibble_0$feature_values[which(shapley_summary_tibble_0$feature_values == 1000)] <- 0
shapley_summary_tibble_0$feature_values[which(shapley_summary_tibble_0$feature_values == -1000)] <- 0
# list of functions to calculate the values where you want your breaks
myfuns <- list(min, median, max)
# use this list to make a list of your breaks
list_val <- lapply(myfuns, function(f) f(shapley_summary_tibble$feature_values))
list_val_0 <- lapply(myfuns, function(f) f(shapley_summary_tibble_0$feature_values))
shapley_summary_plot_1k <- shapley_summary_plot(data_input = shapley_summary_tibble,
list_val_input = list_val)
shapley_summary_plot_na <- shapley_summary_plot(data_input = shapley_summary_tibble_NA,
list_val_input = list_val_0)
shapley_vi <- bind_rows(shapley_x_wider,shapley_y_wider)%>%
select(-"SUBJECTKEY") %>%
abs()%>% colSums()
shapley_vi_tibble <- tibble(modality_names = names(shapley_vi),
shapley_vi = shapley_vi)
shapley_vi_plot <- shapley_vi_tibble %>% mutate(modality_names = fct_reorder(modality_names, shapley_vi,.fun = "max"))%>%
ggplot(aes(x = modality_names, y = shapley_vi))+
geom_bar(stat = "identity")+
coord_flip()+
labs(y = "|SHAP| variable importance", x = "") +
theme(axis.title.x= element_text(size = 20),
axis.title.y= element_text(size = 20),
axis.text.y = element_text(size = 20),
axis.text.x = element_text(size = 20))
return(list(summary_plot_na = shapley_summary_plot_na,
summary_plot_1k = shapley_summary_plot_1k,
importance_plot = shapley_vi_plot))
}
shapley_plot_list(shapley_input = shapley_gractor,
data_input = gfactor_random_forest_train_data,
ID_input = gfactor_random_forest_train_ID)
## Registered S3 method overwritten by 'ggforce':
## method from
## scale_type.units units
## $summary_plot_na
##
## $summary_plot_1k
##
## $importance_plot
random_forest_plotting_followup(rf_fit =rf_fit_CFA,
followup_data=enet_rf_test_followup_CFA,
naming_frame = CFA_var_plotting,
resp_vec= CFA_Resp_Var)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
random_forest_plotting_baseline(rf_fit =rf_fit_CFA,
baseline_data=enet_rf_test_baseline_CFA,
naming_frame = CFA_var_plotting,
resp_vec= CFA_Resp_Var)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
modality_ploting <- c("Stacked_All","Stacked_Complete","Stacked_Best", "Stacked_NoBest", "NBack","MID", "SST","rs_fMRI","sMRI","DTI")
resp_plotting <- c("g_second_order")
predictedObserved_GfactBaseline <-
baseline_CFA %>% select(SUBJECTKEY, g_second_order) %>%
# enet_rf_test_baseline_CFA$g_second_order$ID %>% #observed:
# select(SUBJECTKEY, g_second_order) %>%
full_join(rf_fit_CFA$g_second_order$predicted_baseline, by = "SUBJECTKEY") %>%
rename("Stacked_All"= pred_baseline) %>%
full_join(enet_CFA$Nback_results$g_second_order$predicted_baseline, by = "SUBJECTKEY") %>%
rename("NBack" = pred) %>%
full_join(enet_CFA$SST_results$g_second_order$predicted_baseline, by = "SUBJECTKEY") %>%
rename("SST" = pred) %>%
full_join(enet_CFA$MID_results$g_second_order$predicted_baseline, by = "SUBJECTKEY") %>%
rename("MID" = pred) %>%
full_join(enet_CFA$rsmri_results$g_second_order$predicted_baseline, by = "SUBJECTKEY") %>%
rename("rs_fMRI" = pred) %>%
full_join(enet_CFA$smri_results$g_second_order$predicted_baseline, by = "SUBJECTKEY") %>%
rename("sMRI" = pred) %>%
full_join(enet_CFA$DTI_results$g_second_order$predicted_baseline, by = "SUBJECTKEY") %>%
rename("DTI" = pred)
#create a column for list wise deletion and apply z score
predictedObserved_GfactBaselineZ <- predictedObserved_GfactBaseline %>%
na.omit() %>%
select(SUBJECTKEY, "Stacked_All") %>%
rename("Stacked_Complete" = "Stacked_All") %>%
right_join(predictedObserved_GfactBaseline, by = "SUBJECTKEY")
#%>%
# mutate_at(resp_plotting, ~(scale(.) %>% as.vector))
#create 1) stacked bast = a stacked column with the same subjects with nback
#create 2) stacked others = a stacked column without the nback subjects
predictedObserved_GfactBaselineZ <- predictedObserved_GfactBaselineZ %>%
select(SUBJECTKEY, "Stacked_All", "NBack") %>%
drop_na("NBack") %>%
rename("Stacked_Best" = "Stacked_All") %>% select(-NBack) %>%
right_join(predictedObserved_GfactBaselineZ, by = "SUBJECTKEY") %>%
mutate(Stacked_NoBest = Stacked_All)
predictedObserved_GfactBaselineZ[which(!is.na(predictedObserved_GfactBaselineZ$NBack)),
"Stacked_NoBest"] <- NA
predictedObserved_GfactBaselineZ <-
predictedObserved_GfactBaselineZ[, c(1, 4, 5, 3, 2, 12, 6, 7, 8, 9, 10, 11)]
plotGFact_baseline <- list()
for (modalityNum in seq(1,length(modality_ploting),1))
{
plotGFact_baseline[[modalityNum]] <-
ggplot(data = predictedObserved_GfactBaselineZ,
aes(x = scale(.data[[modality_ploting[modalityNum]]]),
y = scale(.data[['g_second_order']]))) +
geom_jitter(height = 0.1, width = 0.1, size = 1, col = 'grey60') +
geom_smooth(method = 'lm', se = FALSE, col = 'black') +
labs(x = NULL,
y = NULL,
title = paste0(modality_ploting[modalityNum],
' r=',round(cor(predictedObserved_GfactBaselineZ[modality_ploting[modalityNum]],
predictedObserved_GfactBaselineZ$g_second_order,
use = "pairwise.complete.obs"), 3))) +
# "\nMAE=", round(
# yardstick::mae_vec(truth = as.numeric(unlist(predictedObserved_GfactBaselineZ[modality_ploting[modalityNum]])),
# predictedObserved_GfactBaselineZ$g_second_order,
# na_rm = TRUE),3))) +
theme(text = element_text(size=16))
}
figure <-
ggpubr::ggarrange(ggpubr::ggarrange(plotlist=plotGFact_baseline[1:4]),
ggpubr::ggarrange(plotlist=plotGFact_baseline[5:10]),
nrow = 2)
ggpubr::annotate_figure(figure,
top = ggpubr::text_grob("Ability to Predict G-Factor at Baseline", face = "bold", size = 29),
bottom = ggpubr::text_grob("Predicted G-Factor (Z)", size = 30),
left = ggpubr::text_grob("Observed G-Factor (Z)", size = 30, rot = 90)
)
naniar::vis_miss(predictedObserved_GfactBaselineZ %>% select(-SUBJECTKEY,-g_second_order)) +
guides(fill = ggplot2::guide_legend(reverse = TRUE)) +
theme(
axis.text.x=element_text(angle =- 90,hjust = 1),
text = element_text(size=18))
## Warning: `gather_()` was deprecated in tidyr 1.2.0.
## Please use `gather()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
colSums(!is.na(predictedObserved_GfactBaselineZ))
## SUBJECTKEY g_second_order Stacked_All Stacked_Complete
## 5487 5487 5171 1164
## Stacked_Best Stacked_NoBest NBack SST
## 2653 2518 2653 2791
## MID rs_fMRI sMRI DTI
## 3138 3839 4567 4071
predictedObserved_Gfactfollowup <-
followup_CFA %>% select(SUBJECTKEY, g_second_order) %>%
# enet_rf_test_baseline_CFA$g_second_order$ID %>% #observed:
# select(SUBJECTKEY, g_second_order) %>%
full_join(rf_fit_CFA$g_second_order$predicted_followup, by = "SUBJECTKEY") %>%
rename("Stacked_All"= pred_followup) %>%
full_join(enet_CFA$Nback_results$g_second_order$predicted_followup, by = "SUBJECTKEY") %>%
rename("NBack" = pred) %>%
full_join(enet_CFA$SST_results$g_second_order$predicted_followup, by = "SUBJECTKEY") %>%
rename("SST" = pred) %>%
full_join(enet_CFA$MID_results$g_second_order$predicted_followup, by = "SUBJECTKEY") %>%
rename("MID" = pred) %>%
full_join(enet_CFA$rsmri_results$g_second_order$predicted_followup, by = "SUBJECTKEY") %>%
rename("rs_fMRI" = pred) %>%
full_join(enet_CFA$smri_results$g_second_order$predicted_followup, by = "SUBJECTKEY") %>%
rename("sMRI" = pred) %>%
full_join(enet_CFA$DTI_results$g_second_order$predicted_followup, by = "SUBJECTKEY") %>%
rename("DTI" = pred)
#create a column for list wise deletion and apply z score
predictedObserved_GfactfollowupZ <- predictedObserved_Gfactfollowup %>%
na.omit() %>%
select(SUBJECTKEY, "Stacked_All") %>%
rename("Stacked_Complete" = "Stacked_All") %>%
right_join(predictedObserved_Gfactfollowup, by = "SUBJECTKEY")
#%>%
# mutate_at(resp_plotting, ~(scale(.) %>% as.vector))
#create 1) stacked bast = a stacked column with the same subjects with nback
#create 2) stacked others = a stacked column without the nback subjects
predictedObserved_GfactfollowupZ <- predictedObserved_GfactfollowupZ %>%
select(SUBJECTKEY, "Stacked_All", "NBack") %>%
drop_na("NBack") %>%
rename("Stacked_Best" = "Stacked_All") %>% select(-NBack) %>%
right_join(predictedObserved_GfactfollowupZ, by = "SUBJECTKEY") %>%
mutate(Stacked_NoBest = Stacked_All)
predictedObserved_GfactfollowupZ[which(!is.na(predictedObserved_GfactfollowupZ$NBack)),
"Stacked_NoBest"] <- NA
predictedObserved_GfactfollowupZ <-
predictedObserved_GfactfollowupZ[, c(1, 4, 5, 3, 2, 12, 6, 7, 8, 9, 10, 11)]
plotGFact_followup <- list()
for (modalityNum in seq(1,length(modality_ploting),1))
{
plotGFact_followup[[modalityNum]] <-
ggplot(data = predictedObserved_GfactfollowupZ,
aes(x = scale(.data[[modality_ploting[modalityNum]]]) ,
y =scale(.data[['g_second_order']]) )) +
geom_jitter(height = 0.1, width = 0.1, size = 1, col = 'grey60') +
geom_smooth(method = 'lm', se = FALSE, col = 'black') +
labs(x = NULL,
y = NULL,
title = paste0(modality_ploting[modalityNum],
' r=',round(cor(predictedObserved_GfactfollowupZ[modality_ploting[modalityNum]],
predictedObserved_GfactfollowupZ$g_second_order,
use = "pairwise.complete.obs"), 3)))+
# "\nMAE=", round(
# yardstick::mae_vec(truth = as.numeric(unlist(predictedObserved_GfactfollowupZ[modality_ploting[modalityNum]])),
# predictedObserved_GfactfollowupZ$g_second_order,
# na_rm = TRUE),3))) +
theme(text = element_text(size=16))
}
figure <-
ggpubr::ggarrange(ggpubr::ggarrange(plotlist=plotGFact_followup[1:4]),
ggpubr::ggarrange(plotlist=plotGFact_followup[5:10]),
nrow = 2)
ggpubr::annotate_figure(figure,
top = ggpubr::text_grob("Ability to Predict G-Factor at Follow-up", face = "bold", size = 29),
bottom = ggpubr::text_grob("Predicted G-Factor (Z)", size = 30),
left = ggpubr::text_grob("Observed G-Factor (Z)", size = 30, rot = 90)
)
naniar::vis_miss(predictedObserved_GfactfollowupZ %>% select(-SUBJECTKEY,-g_second_order)) +
guides(fill = ggplot2::guide_legend(reverse = TRUE)) +
theme(
axis.text.x=element_text(angle =- 90,hjust = 1),
text = element_text(size=20))
colSums(!is.na(predictedObserved_GfactfollowupZ))
## SUBJECTKEY g_second_order Stacked_All Stacked_Complete
## 5280 5280 5119 1374
## Stacked_Best Stacked_NoBest NBack SST
## 2996 2123 2996 2903
## MID rs_fMRI sMRI DTI
## 3127 4137 4569 4230
predictedObserved_GfactBaselineZ_missing <-
predictedObserved_GfactBaselineZ %>%
mutate(Stacked_All_missing = is.na(Stacked_All)) %>%
mutate(Stacked_Complete_missing = is.na(Stacked_Complete)) %>%
mutate(Stacked_Best_missing = is.na(Stacked_Best)) %>%
mutate(Stacked_NoBest_missing = is.na(Stacked_NoBest)) %>%
mutate(NBack_missing = is.na(NBack)) %>%
mutate(SST_missing = is.na(SST)) %>%
mutate(MID_missing = is.na(MID)) %>%
mutate(rs_fMRI_missing = is.na(rs_fMRI)) %>%
mutate(sMRI_missing = is.na(sMRI)) %>%
mutate(DTI_missing = is.na(DTI))
predictedObserved_GfactfollowupZ_missing <-
predictedObserved_GfactfollowupZ %>%
mutate(Stacked_All_missing = is.na(Stacked_All)) %>%
mutate(Stacked_Complete_missing = is.na(Stacked_Complete)) %>%
mutate(Stacked_Best_missing = is.na(Stacked_Best)) %>%
mutate(Stacked_NoBest_missing = is.na(Stacked_NoBest)) %>%
mutate(NBack_missing = is.na(NBack)) %>%
mutate(SST_missing = is.na(SST)) %>%
mutate(MID_missing = is.na(MID)) %>%
mutate(rs_fMRI_missing = is.na(rs_fMRI)) %>%
mutate(sMRI_missing = is.na(sMRI)) %>%
mutate(DTI_missing = is.na(DTI))
ttest_missing_plot <- function(missing_colume_name, missing_title, missing_response, input_df){
ttest_formula <- as.formula(paste(missing_response, paste(missing_colume_name, collapse=" + "), sep=" ~ "))
ttest_missing_gg <- input_df %>%
ggplot(aes_string(y=missing_response, x= missing_colume_name))+
geom_violin(trim = FALSE) +
stat_summary(
fun.data = "mean_sdl", fun.args = list(mult = 2),
geom = "pointrange", color = "black"
) +
scale_x_discrete(labels=c("Present", "Missing")) +
ggpubr::stat_pvalue_manual(
predictedObserved_GfactBaselineZ_missing %>%
rstatix::t_test(ttest_formula) %>%
rstatix::add_significance() %>%
rstatix::add_xy_position(x = missing_colume_name)) +
ggtitle(paste0(missing_title, "\nCohen d = ",
rstatix::cohens_d(data = input_df,
formula = ttest_formula,
var.equal = FALSE)%>% .$effsize %>% round(digits =3)))+
theme(axis.title.x=element_blank()) +
theme(axis.title.y=element_blank())
return(ttest_missing_gg)
}
missing_vars <- predictedObserved_GfactBaselineZ_missing %>%
select(ends_with("missing")) %>%
colnames() %>%
as.list()
missing_names <- predictedObserved_GfactBaselineZ_missing %>%
select(ends_with("missing")) %>%
colnames() %>%
str_replace("_missing", "") %>%
as.list()
missing_response <- replicate(length(missing_names), "g_second_order") %>% as.list()
missing_input_baseline_df <- replicate(length(missing_names), predictedObserved_GfactBaselineZ_missing, simplify = FALSE)
missing_input_followUp_df <- replicate(length(missing_names), predictedObserved_GfactfollowupZ_missing, simplify = FALSE)
ttest_missing_plot_baseline_list <- purrr::pmap(list(missing_colume_name = missing_vars,
missing_title = missing_names,
missing_response = missing_response,
input_df = missing_input_baseline_df),
ttest_missing_plot)
title_baseline.grob <- textGrob("Baseline Test",
gp=gpar(fontsize=18))
ttest_missing_plot_baseline_gg <-
cowplot::plot_grid(plotlist=ttest_missing_plot_baseline_list,
nrow =5, align='vh', vjust=1, scale = 1)
ttest_missing_plot_baseline_gg_title <- gridExtra::grid.arrange(gridExtra::arrangeGrob(ttest_missing_plot_baseline_gg,
top = title_baseline.grob))
ttest_missing_plot_followup_list <- purrr::pmap(list(missing_colume_name = missing_vars,
missing_title = missing_names,
missing_response = missing_response,
input_df = missing_input_followUp_df),
ttest_missing_plot)
title_followup.grob <- textGrob("Follow Up Test",
gp=gpar(fontsize=18))
ttest_missing_plot_followup_gg <-
cowplot::plot_grid(plotlist=ttest_missing_plot_followup_list,
nrow =5, align='vh', vjust=1, scale = 1)
ttest_missing_plot_followup_gg_title <- gridExtra::grid.arrange(gridExtra::arrangeGrob(ttest_missing_plot_followup_gg,
top = title_followup.grob))
ttest_missing_plot_both_gg <- cowplot::plot_grid(ttest_missing_plot_baseline_gg_title,
ttest_missing_plot_followup_gg_title,
nrow =1)
title_overall.grob <- textGrob(expression(paste("Differences in ",italic("g"), "-Factor(Z) As a Function of Missing Data")),
gp=gpar(fontsize=20))
y_overall.grob <- textGrob(expression(paste(italic("g"), "-factor(Z)")),
gp=gpar(fontsize=18), rot = 90)
ttest_missing_plot_both_gg_title <- gridExtra::grid.arrange(gridExtra::arrangeGrob(ttest_missing_plot_both_gg,
top = title_overall.grob,
left = y_overall.grob))
get_summary_stats <- function(data_input, target_input){
data_input <- data_input %>%
mutate(pred_target = as.numeric(scale(data_input$g_second_order)) )
mae_perform <- yardstick::mae(data =data_input,
truth=pred_target, estimate=data_input[[target_input]])
rmse_perform <- yardstick::rmse(data =data_input,
truth=pred_target, estimate=data_input[[target_input]])
cor_perform <- cor(data_input$g_second_order,
data_input[[target_input]],
use = "pairwise.complete.obs")
tradrsq_perform <- yardstick::rsq_trad(data = data_input,
truth=pred_target,
estimate=data_input[[target_input]])
return( tibble(Models=target_input,
Pearson_r=round(cor_perform,3),
R_squared=round(tradrsq_perform$.estimate,3),
MAE=round(mae_perform$.estimate,3),
RMSE=round(rmse_perform$.estimate,3)
) )
}
elem_remove <- c("SUBJECTKEY","g_second_order")
baseline_pred_vec <- setdiff(colnames(predictedObserved_GfactBaselineZ),elem_remove)
baseline_perform <- baseline_pred_vec %>%
map(.,~get_summary_stats(data_input = predictedObserved_GfactBaselineZ,
target_input = .))%>%
do.call(rbind,.)%>%
kableExtra::kbl(caption = paste0("Predictive Performance for the baseline test set")) %>%
kableExtra::kable_classic(full_width = F, html_font = "Cambria") %>%
print()
Models | Pearson_r | R_squared | MAE | RMSE |
---|---|---|---|---|
Stacked_All | 0.439 | 0.191 | 0.699 | 0.895 |
Stacked_Complete | 0.429 | 0.183 | 0.610 | 0.780 |
Stacked_Best | 0.442 | 0.195 | 0.620 | 0.798 |
Stacked_NoBest | 0.296 | 0.085 | 0.783 | 0.987 |
NBack | 0.402 | 0.072 | 0.664 | 0.857 |
SST | 0.129 | -0.033 | 0.744 | 0.950 |
MID | 0.202 | 0.013 | 0.738 | 0.944 |
rs_fMRI | 0.233 | 0.042 | 0.749 | 0.955 |
sMRI | 0.208 | 0.040 | 0.763 | 0.969 |
DTI | 0.190 | 0.033 | 0.757 | 0.972 |
followup_pred_vec <- setdiff(colnames(predictedObserved_GfactfollowupZ)
,elem_remove)
followup_perform <- followup_pred_vec %>%
map(.,~get_summary_stats(data_input = predictedObserved_GfactfollowupZ,
target_input = .))%>%
do.call(rbind,.)%>%
kableExtra::kbl(caption = paste0("Predictive Performance for the followup test set")) %>%
kableExtra::kable_classic(full_width = F, html_font = "Cambria") %>%
print()
Models | Pearson_r | R_squared | MAE | RMSE |
---|---|---|---|---|
Stacked_All | 0.414 | 0.166 | 0.719 | 0.913 |
Stacked_Complete | 0.427 | 0.168 | 0.651 | 0.829 |
Stacked_Best | 0.438 | 0.175 | 0.666 | 0.846 |
Stacked_NoBest | 0.317 | 0.100 | 0.794 | 1.000 |
NBack | 0.383 | 0.118 | 0.687 | 0.875 |
SST | 0.145 | -0.004 | 0.760 | 0.961 |
MID | 0.148 | -0.003 | 0.757 | 0.955 |
rs_fMRI | 0.251 | 0.055 | 0.754 | 0.954 |
sMRI | 0.226 | 0.049 | 0.764 | 0.965 |
DTI | 0.212 | 0.045 | 0.771 | 0.978 |
perfDiff = function(data, indices){
sample = data[indices,]
cor_Stack <- cor(sample$g_second_order,
sample$Stacked_Best,
use = "pairwise.complete.obs")
cor_NBack <- cor(sample$g_second_order,
sample$NBack,
use = "pairwise.complete.obs")
tradrsq_Stack <- yardstick::rsq_trad(data =sample,
truth=g_second_order,
estimate=Stacked_Best)
tradrsq_NBack <- yardstick::rsq_trad(data =sample,
truth=g_second_order,
estimate=NBack)
mae_Stack <- yardstick::mae(data =sample,
truth=g_second_order,
estimate=Stacked_Best)
mae_NBack <- yardstick::mae(data =sample,
truth=g_second_order,
estimate=NBack)
rmse_Stack <- yardstick::rmse(data =sample,
truth=g_second_order,
estimate=Stacked_Best)
rmse_NBack <- yardstick::rmse(data =sample,
truth=g_second_order,
estimate=NBack)
cor_StackVSNBack = cor_Stack - cor_NBack
tradrsq_StackVSNBack = tradrsq_Stack$.estimate - tradrsq_NBack$.estimate
mae_StackVSNBack = mae_Stack$.estimate - mae_NBack$.estimate
rmse_StackVSNBack = rmse_Stack$.estimate - rmse_NBack$.estimate
StackVSNBack = c(cor_StackVSNBack, tradrsq_StackVSNBack, mae_StackVSNBack, rmse_StackVSNBack)
return(StackVSNBack)
}
StackBestVSBaseline <- predictedObserved_GfactBaselineZ %>%
select(g_second_order,Stacked_Best,NBack) %>% na.omit()
set.seed(12345)
DiffResultsBaseline <- boot::boot(data = StackBestVSBaseline,
statistic = perfDiff,
R = 5000)
#"Bias Corrected and Accelerated"
ci_r_baseline <- boot::boot.ci(DiffResultsBaseline, index = 1, conf = 0.95, type = 'bca')
ci_rsq_baseline <-boot::boot.ci(DiffResultsBaseline, index = 2, conf = 0.95, type = 'bca')
ci_mae_baseline <- boot::boot.ci(DiffResultsBaseline, index = 3, conf = 0.95, type = 'bca')
ci_rmse_baseline <- boot::boot.ci(DiffResultsBaseline, index = 4, conf = 0.95, type = 'bca')
ci_r_baseline
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 5000 bootstrap replicates
##
## CALL :
## boot::boot.ci(boot.out = DiffResultsBaseline, conf = 0.95, type = "bca",
## index = 1)
##
## Intervals :
## Level BCa
## 95% ( 0.0206, 0.0585 )
## Calculations and Intervals on Original Scale
ci_rsq_baseline
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 5000 bootstrap replicates
##
## CALL :
## boot::boot.ci(boot.out = DiffResultsBaseline, conf = 0.95, type = "bca",
## index = 2)
##
## Intervals :
## Level BCa
## 95% ( 0.0727, 0.1393 )
## Calculations and Intervals on Original Scale
ci_mae_baseline
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 5000 bootstrap replicates
##
## CALL :
## boot::boot.ci(boot.out = DiffResultsBaseline, conf = 0.95, type = "bca",
## index = 3)
##
## Intervals :
## Level BCa
## 95% (-0.0396, -0.0157 )
## Calculations and Intervals on Original Scale
ci_rmse_baseline
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 5000 bootstrap replicates
##
## CALL :
## boot::boot.ci(boot.out = DiffResultsBaseline, conf = 0.95, type = "bca",
## index = 4)
##
## Intervals :
## Level BCa
## 95% (-0.0516, -0.0270 )
## Calculations and Intervals on Original Scale
StackBestVSfollowup <- predictedObserved_GfactfollowupZ %>%
select(g_second_order,Stacked_Best,NBack) %>% na.omit()
set.seed(12345)
DiffResultsfollowup <- boot::boot(data = StackBestVSfollowup,
statistic = perfDiff,
R = 5000)
#"Bias Corrected and Accelerated"
ci_r_followup <- boot::boot.ci(DiffResultsfollowup, index = 1, conf = 0.95, type = 'bca')
ci_rsq_followup <-boot::boot.ci(DiffResultsfollowup, index = 2, conf = 0.95, type = 'bca')
ci_mae_followup <- boot::boot.ci(DiffResultsfollowup, index = 3, conf = 0.95, type = 'bca')
ci_rmse_followup <- boot::boot.ci(DiffResultsfollowup, index = 4, conf = 0.95, type = 'bca')
ci_r_followup
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 5000 bootstrap replicates
##
## CALL :
## boot::boot.ci(boot.out = DiffResultsfollowup, conf = 0.95, type = "bca",
## index = 1)
##
## Intervals :
## Level BCa
## 95% ( 0.0359, 0.0732 )
## Calculations and Intervals on Original Scale
ci_rsq_followup
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 5000 bootstrap replicates
##
## CALL :
## boot::boot.ci(boot.out = DiffResultsfollowup, conf = 0.95, type = "bca",
## index = 2)
##
## Intervals :
## Level BCa
## 95% ( 0.0126, 0.0745 )
## Calculations and Intervals on Original Scale
ci_mae_followup
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 5000 bootstrap replicates
##
## CALL :
## boot::boot.ci(boot.out = DiffResultsfollowup, conf = 0.95, type = "bca",
## index = 3)
##
## Intervals :
## Level BCa
## 95% (-0.0211, 0.0020 )
## Calculations and Intervals on Original Scale
ci_rmse_followup
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 5000 bootstrap replicates
##
## CALL :
## boot::boot.ci(boot.out = DiffResultsfollowup, conf = 0.95, type = "bca",
## index = 4)
##
## Intervals :
## Level BCa
## 95% (-0.0294, -0.0050 )
## Calculations and Intervals on Original Scale
r_plot_baseline <- as_tibble(DiffResultsBaseline$t[,1]) %>%
ggplot() + geom_density(aes(x = value)) +
ggtitle("Pearson's r") +
geom_vline(aes(xintercept=ci_r_baseline$bca[4]), color="red", linetype="dashed") +
geom_vline(aes(xintercept=ci_r_baseline$bca[5]), color="red", linetype="dashed") +
theme(axis.title.x=element_blank(),
axis.title.y=element_blank(),
text = element_text(size=20))
rsq_plot_baseline <- as_tibble(DiffResultsBaseline$t[,2]) %>%
ggplot() + geom_density(aes(x = value)) +
ggtitle("R squared") +
geom_vline(aes(xintercept=ci_rsq_baseline$bca[4]), color="red", linetype="dashed") +
geom_vline(aes(xintercept=ci_rsq_baseline$bca[5]), color="red", linetype="dashed") +
theme(axis.title.x=element_blank(),
axis.title.y=element_blank(),
text = element_text(size=20))
mae_plot_baseline <- as_tibble(DiffResultsBaseline$t[,3]) %>%
ggplot() + geom_density(aes(x = value)) +
ggtitle("MAE") +
geom_vline(aes(xintercept=ci_mae_baseline$bca[4]), color="red", linetype="dashed") +
geom_vline(aes(xintercept=ci_mae_baseline$bca[5]), color="red", linetype="dashed") +
theme(axis.title.x=element_blank(),
axis.title.y=element_blank(),
text = element_text(size=20))
rmse_plot_baseline <- as_tibble(DiffResultsBaseline$t[,4]) %>%
ggplot() + geom_density(aes(x = value)) +
ggtitle("RMSE") +
geom_vline(aes(xintercept=ci_rmse_baseline$bca[4]), color="red", linetype="dashed") +
geom_vline(aes(xintercept=ci_rmse_baseline$bca[5]), color="red", linetype="dashed") +
theme(axis.title.x=element_blank(),
axis.title.y=element_blank(),
text = element_text(size=20))
boot_baseline <-
ggpubr::ggarrange(plotlist=
list(r_plot_baseline,
rsq_plot_baseline,
mae_plot_baseline,
rmse_plot_baseline)) %>%
ggpubr::annotate_figure(top = ggpubr::text_grob("Baseline Test Set",
size=22,
face = "bold"))
r_plot_followup <- as_tibble(DiffResultsfollowup$t[,1]) %>%
ggplot() + geom_density(aes(x = value)) +
ggtitle("Pearson's r") +
geom_vline(aes(xintercept=ci_r_followup$bca[4]), color="red", linetype="dashed") +
geom_vline(aes(xintercept=ci_r_followup$bca[5]), color="red", linetype="dashed") +
theme(axis.title.x=element_blank(),
axis.title.y=element_blank(),
text = element_text(size=20))
rsq_plot_followup <- as_tibble(DiffResultsfollowup$t[,2]) %>%
ggplot() + geom_density(aes(x = value)) +
ggtitle("R squared") +
geom_vline(aes(xintercept=ci_rsq_followup$bca[4]), color="red", linetype="dashed") +
geom_vline(aes(xintercept=ci_rsq_followup$bca[5]), color="red", linetype="dashed") +
theme(axis.title.x=element_blank(),
axis.title.y=element_blank(),
text = element_text(size=20))
mae_plot_followup <- as_tibble(DiffResultsfollowup$t[,3]) %>%
ggplot() + geom_density(aes(x = value)) +
ggtitle("MAE") +
geom_vline(aes(xintercept=ci_mae_followup$bca[4]), color="red", linetype="dashed") +
geom_vline(aes(xintercept=ci_mae_followup$bca[5]), color="red", linetype="dashed") +
theme(axis.title.x=element_blank(),
axis.title.y=element_blank(),
text = element_text(size=20))
rmse_plot_followup <- as_tibble(DiffResultsfollowup$t[,4]) %>%
ggplot() + geom_density(aes(x = value)) +
ggtitle("RMSE") +
geom_vline(aes(xintercept=ci_rmse_followup$bca[4]), color="red", linetype="dashed") +
geom_vline(aes(xintercept=ci_rmse_followup$bca[5]), color="red", linetype="dashed") +
theme(axis.title.x=element_blank(),
axis.title.y=element_blank(),
text = element_text(size=20))
boot_followup <- ggpubr::ggarrange(plotlist=
list(r_plot_followup, rsq_plot_followup, mae_plot_followup, rmse_plot_followup)) %>%
ggpubr::annotate_figure(top = ggpubr::text_grob("Follow-up Test Set",
size=22,
face = "bold"))
ggpubr::ggarrange(boot_baseline, boot_followup,
nrow=2) %>%
ggpubr::annotate_figure(top =
ggpubr::text_grob("Bootstrapped Distribution Stacked Best > N-Back",
size=24,
face = "bold"))
To test if the model generalise across sites, we used the baseline data, fit the model to all but one site and test the results at hold-out site.
After leaving one site as a test, we 50/50 divided the rest into train (i.e,. 1st layer) and validate (i.e., 2nd layer). This process then was repeated until we finished all of held-out sites.
baseline_site <- baseline_analysis %>% drop_na(SITE_ID_L)
unique_site <- length(unique(baseline_site$SITE_ID_L))
site_vfold <- group_vfold_cv(data = baseline_site, group = "SITE_ID_L", v= unique_site)
## the following two lines take the split results and transform it into a data frame
## so that all the results can be checked.
#model_data_1 <- site_vfold$splits[[1]]%>% analysis()
#holdout__data_1 <- site_vfold$splits[[1]] %>% assessment()
all_sample <- site_vfold$id
all_site <- all_sample %>%
map(.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L)) %>%
do.call(rbind,.)
all_site <- as.vector(all_site)
all_site_number <- str_remove_all(all_site,"site")
###divide the sampled data into train valid and test sets
#after leave one site, the rest was devided into 2: train and validate
train_valid_split <- all_sample %>%
map(.,~vfold_cv(data=analysis(site_vfold$splits[[which(site_vfold$id==.)]]),
v=2,repeats = 1))
names(train_valid_split)<- all_sample %>%
map(.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L))
test_site_split <- all_sample %>%
map(.,~assessment(site_vfold$splits[[which(site_vfold$id==.)]]))
names(test_site_split)<- all_sample %>%
map(.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L))
Note, for cross site validation, we only focus on the baseline data. We didn’t use the followup data here.
enet_site_nocom <-
function(resp_var,
split_train = analysis( data_two_fold$splits[[1]]),
split_valid = assessment(data_two_fold$splits[[1]]),
split_test=data_fold_test,
contrast_name){
data_train <- split_train%>%
select(all_of(resp_var),
all_of(subj_info),
starts_with(contrast_name))%>%
drop_na()%>% IQR_remove()
data_valid <- split_valid%>%
select(all_of(resp_var),
all_of(subj_info),
starts_with(contrast_name))%>%
drop_na()%>% IQR_remove()
test_set <- split_test%>%
select(all_of(resp_var),
all_of(subj_info),
starts_with(contrast_name))%>%
drop_na()%>% IQR_remove()
ols_data_train <-
select(data_train,
resp_var,
starts_with(contrast_name))
ols_data_valid <-
select(data_valid,
resp_var,
starts_with(contrast_name))
ols_test_set <- select(test_set,
resp_var,
starts_with(contrast_name))
return(list(ols_data_train=ols_data_train,
ols_data_valid=ols_data_valid,
ols_test_set=ols_test_set,
data_valid_id=data_valid$SUBJECTKEY,
test_set_id=test_set$SUBJECTKEY,
data_train_id=data_train$SUBJECTKEY))
}
enet_site_com <- function(resp_var,
split_train = analysis( data_two_fold$splits[[1]]),
split_valid = assessment(data_two_fold$splits[[1]]),
split_test=data_fold_test,
measure_one,
measure_two){
data_train <-split_train%>%
select(all_of(resp_var),
all_of(subj_info),
starts_with(measure_one),
starts_with(measure_two))%>%
drop_na()%>% IQR_remove()
data_valid <- split_valid%>%
select(all_of(resp_var),
all_of(subj_info),
starts_with(measure_one),
starts_with(measure_two))%>%
drop_na()%>% IQR_remove()
test_set <- split_test%>%
select(all_of(resp_var),
all_of(subj_info),
starts_with(measure_one),
starts_with(measure_two))%>%
drop_na()%>% IQR_remove()
ols_data_train <- batch_adjust(data_fold = data_train,
resp = resp_var,
x=measure_one,
y=measure_two)
ols_data_valid <- batch_adjust(data_fold = data_valid,
resp = resp_var,
x=measure_one,
y= measure_two)
ols_test_set <- select(test_set,
resp_var,
starts_with(measure_one),
starts_with(measure_two) )
return(list(ols_data_train=ols_data_train,
ols_data_valid=ols_data_valid,
ols_test_set=ols_test_set,
data_valid_id=data_valid$SUBJECTKEY,
test_set_id=test_set$SUBJECTKEY,
data_train_id=data_train$SUBJECTKEY))
}
# for models that require Combat that has one type of data. so far this was used for DTI only. This is bc DTI only one prefix ("fa")
enet_site_comOne <- function(resp_var,
split_train = analysis( data_two_fold$splits[[1]]),
split_valid = assessment(data_two_fold$splits[[1]]),
split_test=data_fold_test, measure_one){
data_train <-split_train%>%
select(all_of(resp_var),
all_of(subj_info),
starts_with(measure_one))%>%
drop_na()%>%
IQR_remove()
data_valid <- split_valid%>%
select(all_of(resp_var),
all_of(subj_info),
starts_with(measure_one))%>%
drop_na()%>%
IQR_remove()
test_set <- split_test%>%
select(all_of(resp_var),
all_of(subj_info),
starts_with(measure_one))%>%
drop_na()%>% IQR_remove()
ols_data_train <- batch_adjustOne(data_fold = data_train,
resp = resp_var, x=measure_one)
ols_data_valid <- batch_adjustOne(data_fold = data_valid,
resp = resp_var, x=measure_one)
ols_test_set <- select(test_set,
resp_var,
starts_with(measure_one))
return(list(ols_data_train=ols_data_train,
ols_data_valid=ols_data_valid,
ols_test_set=ols_test_set,
data_valid_id=data_valid$SUBJECTKEY,
test_set_id=test_set$SUBJECTKEY,
data_train_id=data_train$SUBJECTKEY))
}
enet_tuning_site <-
function(resp_var, data_list){
#recipe train and test data are from ols as the variables are the same
norm_recipe <-recipe(as.formula(paste0(resp_var, "~ .")),
data =data_list[["ols_data_train"]] ) %>%
step_dummy(all_nominal()) %>%
step_center(all_predictors()) %>%
step_scale(all_predictors()) %>%
# estimate the means and standard deviations
prep(training = data_list[["ols_data_train"]], retain = TRUE)
## model
glmn_mod <- linear_reg(penalty = tune() ,mixture =tune()) %>%
set_engine("glmnet")
set.seed(123456)
all_rs <- rsample::vfold_cv(data=data_list[["ols_data_train"]],v = 10,repeats = 1)
glmn_wfl <- workflow()%>%
add_recipe(norm_recipe)%>%
add_model(glmn_mod)
## penalty is lambda, mixture is alpha
## log10 transformed to get lambda between 10^-10 to 10^1
## this expands the default values (10^-10 to 10^0) from tidymodels https://github.com/tidymodels/dials/issues/140
glmn_set <- dials::parameters(penalty(range = c(-10,1), trans = log10_trans()),mixture())
## 100 levels of lambda and 11 levels of alpha (0, .1 to 1) # use 100 level as opposed to 200 to reduce the use ofram
glmn_grid <- grid_regular(glmn_set, levels = c(100,11))
ctrl <- control_grid(save_pred = TRUE, verbose = TRUE)
glmn_tune <-tune::tune_grid(glmn_wfl,
resamples = all_rs,
grid = glmn_grid,
metrics = metric_set(mae),##use traditional r square because there is no correlation
control = ctrl)
##select the best tuned lambda and alpha
best_glmn <- select_best(glmn_tune, metric = "mae")
## use the best tunes parameters to fit the test data
glmn_wfl_final <-
glmn_wfl %>%
finalize_workflow(best_glmn) %>%
parsnip::fit(data = data_list[["ols_data_train"]])
### this is the prediction for random forest tuning
pred_glmn <- predict(glmn_wfl_final, new_data =data_list[["ols_data_valid"]])
###this is the prediction for test set
pred_test <- predict(glmn_wfl_final, new_data =data_list[["ols_test_set"]])
##compute the correlation between predicted results and the test data
performance_glmn <- cor(pred_glmn$.pred,data_list[["ols_data_valid"]][[resp_var]])
performance_test <- cor(pred_test$.pred,data_list[["ols_test_set"]][[resp_var]])
best_glmn <- best_glmn %>% mutate(performance=performance_glmn,test_performance = performance_test)
return(list("param_tune"=best_glmn,
"predicted"=tibble(SUBJECTKEY=data_list[["data_valid_id"]],
"pred"=pred_glmn$.pred),
"predicted_test"=tibble(SUBJECTKEY=data_list[["test_set_id"]],
"pred"=pred_test$.pred),
"obs_train"=length(data_list[["data_train_id"]]),
"obs_valid"=length(data_list[["data_valid_id"]])))
}
rnd_for_data_site = function(pred_select = "predicted",
data_list,site_idx,
resp_chose,
resp_data){
enet_pred <-
plyr::join_all(list(data_list[["MID_site"]][[resp_chose]][[site_idx]][[pred_select]]%>%
dplyr::rename("MID_pred"="pred"),
data_list[["smri_site"]][[resp_chose]][[site_idx]][[pred_select]]%>%
dplyr::rename("smri_pred"="pred"),
data_list[["SST_site"]][[resp_chose]][[site_idx]][[pred_select]]%>%
dplyr::rename("SST_pred"="pred"),
data_list[["rsmri_site"]][[resp_chose]][[site_idx]][[pred_select]]%>%
dplyr::rename("rsmri_pred"="pred"),
data_list[["Nback_site"]][[resp_chose]][[site_idx]][[pred_select]]%>%
dplyr::rename("Nback_pred"="pred"),
data_list[["DTI_site"]][[resp_chose]][[site_idx]][[pred_select]]%>%
dplyr::rename("DTI_pred"="pred")),
by='SUBJECTKEY', type='full') %>%
distinct(SUBJECTKEY, .keep_all = TRUE)
enet_pred_p <- enet_pred%>% mutate_all(~replace(., is.na(.), 1000))
enet_pred_n <- enet_pred%>% mutate_all(~replace(., is.na(.), -1000))
enet_pred_noNA <- left_join(enet_pred_p,
enet_pred_n,
by="SUBJECTKEY")
resp_data <- resp_data %>%
select("SUBJECTKEY",resp_chose)%>%
distinct(SUBJECTKEY, .keep_all = TRUE) %>%
drop_na()%>%
mutate_if(is.double, ~ (.x - mean(.x)) / sd(.x))
enet_pred_noNA <- left_join(enet_pred_noNA,resp_data,by="SUBJECTKEY")
enet_pred_noNA_noID <- select(enet_pred_noNA,-SUBJECTKEY)
return(list("ID"=enet_pred_noNA,"NoID"=enet_pred_noNA_noID))
}
random_forest_site <- function(resp,valid_list,test_list,site_chose){
### to avoid repetition of coding a list is created with one data frame with ID and another without
rf_train_data <- valid_list[[resp]][[site_chose]]
rf_test_data <- test_list[[resp]][[site_chose]]
resp_var <- resp
##data recipe, assign "SUBJECTKEY" to ID (a class of variables neither belong to predictors nor response variable)
## do not scale the input data as 1000 and -1000 are put there on purpose
forest_rec <- recipe(as.formula(paste0(resp_var, "~ .")), data = rf_train_data[["NoID"]]) %>%
step_dummy(all_nominal()) %>%
prep(training = rf_train_data[["NoID"]], retain = TRUE)
## mtry is the number of predictors to sample at each split
## min_n (the number of observations needed to keep splitting nodes)
tune_spec <- rand_forest(
mtry = tune(),
trees = 1000,
min_n = tune()
) %>%
set_mode("regression") %>%
set_engine("ranger")
tune_wf <- workflow() %>%
add_recipe(forest_rec) %>%
add_model(tune_spec)
set.seed(123456)
## nested data validation
all_rs <- rsample::vfold_cv(data=rf_train_data[["ID"]],v = 10,repeats = 1)
## automate generate grid for hyperparameters
rf_grid <- grid_regular(
mtry(range = c(1, 12)), #12 is the highest number of predictors
min_n(range = c(1, 500)), #500 is enough based on earlier analyses
levels = c(mtry = 12, min_n = 51) #mtry = 10 so that it increases one step at a time
)
## dofuture package is imported by package furrr
doParallel::registerDoParallel()
future::plan(multisession,workers= all_cores)
tune_res <- tune_grid(
tune_wf,
resamples = all_rs,
metrics = metric_set(rmse),
grid = rf_grid
)
best_tune <- select_best(tune_res, metric = "rmse")
final_rf <- finalize_model(
tune_spec,
best_tune
)
forest_prep <- prep(forest_rec)
## this one fit the random forest model with the best tuned parameters
rf_fit <- final_rf %>%
set_engine("ranger") %>%
parsnip::fit(as.formula(paste0(resp_var, "~ .")),
data = juice(forest_prep)
)
pred_rf <- predict(rf_fit,new_data = rf_test_data[["NoID"]])
performance_rf <- cor(pred_rf$.pred,rf_test_data[["ID"]][[resp]])
best_tune <- best_tune %>% mutate(performance=performance_rf)
result_list <- list("param_tune"=best_tune, "predicted"=tibble(SUBJECTKEY=rf_test_data[["ID"]]$SUBJECTKEY, "pred"=pred_rf$.pred),"model_fit"=rf_fit)
return(result_list)
}
change_modality_names <- function(performance_frame){
performance_frame$modality[which(performance_frame$modality=="rsmri")]<- "rs_fMRI"
performance_frame$modality[which(performance_frame$modality=="smri")]<- "sMRI"
performance_frame$modality[which(performance_frame$modality=="random_forest")]<- "Stacked"
performance_frame$modality[which(performance_frame$modality=="Nback")]<- "NBack"
performance_frame <- performance_frame%>%
mutate(modality= as.factor(modality))%>%
mutate(modality = factor(modality,levels =c ("Stacked","NBack","MID", "SST","rs_fMRI","sMRI","DTI")))
}
CFA_lav_response_site <- function(data_input = baseline_two_fold$splits[[1]], test_baseline =baseline_test){
## training the model and for enet parameter tuning
data_train <- analysis(data_input)%>%
select(all_of(subj_info),all_of(TaskDVs1Batch))%>% drop_na()%>% IQR_remove()
## random forest tuning
data_valid <- assessment(data_input)%>%
select(all_of(subj_info),all_of(TaskDVs1Batch))%>% drop_na()%>% IQR_remove()
## test set for baseline observations
data_baseline <- test_baseline%>%
select(all_of(subj_info),all_of(TaskDVs1Batch))%>% drop_na()%>% IQR_remove()
### first order CFA model
NeuroCog1stOrder <-'
Language =~ NIHTBX_PICVOCAB_UNCORRECTED + NIHTBX_READING_UNCORRECTED
CognitiveFlexibity =~ NIHTBX_FLANKER_UNCORRECTED + NIHTBX_PATTERN_UNCORRECTED
MemoryRecall =~ NIHTBX_PICTURE_UNCORRECTED + PEA_RAVLT_LD_TRIAL_VII_TC'
NeuroCog1stOrder.Fit <- lavaan::cfa(model = NeuroCog1stOrder, data = data_train,estimator="MLR")
### second order CFA model
NeuroCog2ndOrder <-'
Language =~ NIHTBX_PICVOCAB_UNCORRECTED + NIHTBX_READING_UNCORRECTED
CognitiveFlexibity =~ NIHTBX_FLANKER_UNCORRECTED + NIHTBX_PATTERN_UNCORRECTED
MemoryRecall =~ NIHTBX_PICTURE_UNCORRECTED + PEA_RAVLT_LD_TRIAL_VII_TC
g =~ NA*Language + CognitiveFlexibity + MemoryRecall #estimate the loading of GenAbi -> as opposed to using it as a marker
g ~~ 1*g #need to constrain variance to 1'
NeuroCog2ndOrder.Fit <- lavaan::cfa(model = NeuroCog2ndOrder, data = data_train,estimator="MLR")
NeuroCogSingleFactor <-'
g =~ NIHTBX_PICVOCAB_UNCORRECTED + NIHTBX_READING_UNCORRECTED + NIHTBX_FLANKER_UNCORRECTED + NIHTBX_PATTERN_UNCORRECTED + NIHTBX_PICTURE_UNCORRECTED + PEA_RAVLT_LD_TRIAL_VII_TC'
NeuroCogSingleFactor.Fit <- lavaan::cfa(model = NeuroCogSingleFactor, data = data_train,estimator="MLR")
train_CFA_lav <- CFA_lav_output_processing(data_input = data_train,
CFA_model1 = NeuroCog1stOrder.Fit,
CFA_model2 = NeuroCog2ndOrder.Fit,
CFA_model3 = NeuroCogSingleFactor.Fit)
valid_CFA_lav <- CFA_lav_output_processing(data_input = data_valid,
CFA_model1 = NeuroCog1stOrder.Fit,
CFA_model2 = NeuroCog2ndOrder.Fit,
CFA_model3 = NeuroCogSingleFactor.Fit)
baseline_CFA_lav <- CFA_lav_output_processing(data_input = data_baseline,
CFA_model1 = NeuroCog1stOrder.Fit,
CFA_model2 = NeuroCog2ndOrder.Fit,
CFA_model3 = NeuroCogSingleFactor.Fit)
output_list <- list(enet_tuning = train_CFA_lav, RanFor_tuning = valid_CFA_lav, testing_baseline = baseline_CFA_lav)
return(list(output_list, NeuroCog1stOrder.Fit, NeuroCog2ndOrder.Fit,NeuroCogSingleFactor.Fit))
}
CFA_list_site <- all_site %>% map(.,~CFA_lav_response_site(data_input = train_valid_split[[.]][["splits"]][[1]],
test_baseline =test_site_split[[.]]))
names(CFA_list_site) <- all_site
CFA_resp_site <- c("g_second_order" )
Nback_cross_site_CFA <- function(resp_chosen){
# doParallel::registerDoParallel()
### map over all sites instead of one single response variable
Nback_site_data <- all_site %>%
map(.,~enet_site_nocom(resp_var = resp_chosen,
split_train = left_join(analysis(train_valid_split[[.]]$splits[[1]]),
CFA_list_site[[.]][[1]][["enet_tuning"]],
by="SUBJECTKEY")%>%
distinct(SUBJECTKEY, .keep_all = TRUE) ,
split_valid=left_join(assessment(train_valid_split[[.]]$splits[[1]]),
CFA_list_site[[.]][[1]][["RanFor_tuning"]],
by="SUBJECTKEY")%>%
distinct(SUBJECTKEY, .keep_all = TRUE) ,
split_test = left_join(test_site_split[[.]],
CFA_list_site[[.]][[1]][["testing_baseline"]],
by="SUBJECTKEY")%>%
distinct(SUBJECTKEY, .keep_all = TRUE) ,
contrast_name = "X2backvs0back_ROI_"))
names(Nback_site_data) <- all_sample %>%
future_map (.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L),
.options = furrr_options(seed=TRUE),
.progress = TRUE)
Nback_site <- all_site %>%
map(.,~enet_tuning_site(resp_var =resp_chosen,
data_list = Nback_site_data[[.]]))
names(Nback_site)<- all_sample %>%
map (.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L))
return(Nback_site)
}
SST_cross_site_CFA <- function(resp_chosen){
# doParallel::registerDoParallel()
### map over all sites instead of one single response variable
SST_site_data <- all_site %>%
map(.,~enet_site_nocom(resp_var = resp_chosen,
split_train = left_join(analysis(train_valid_split[[.]]$splits[[1]]),
CFA_list_site[[.]][[1]][["enet_tuning"]],
by="SUBJECTKEY")%>%
distinct(SUBJECTKEY, .keep_all = TRUE) ,
split_valid=left_join(assessment(train_valid_split[[.]]$splits[[1]]),
CFA_list_site[[.]][[1]][["RanFor_tuning"]],
by="SUBJECTKEY")%>%
distinct(SUBJECTKEY, .keep_all = TRUE) ,
split_test = left_join(test_site_split[[.]],
CFA_list_site[[.]][[1]][["testing_baseline"]],
by="SUBJECTKEY")%>%
distinct(SUBJECTKEY, .keep_all = TRUE) ,
contrast_name = "anystopvscorrectgo_ROI_"))
names(SST_site_data) <- all_sample %>%
map(.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L))
SST_site <- all_site %>%
map(.,~enet_tuning_site(resp_var =resp_chosen,
data_list = SST_site_data[[.]]))
names(SST_site)<- all_sample %>%
map(.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L))
return(SST_site)
}
MID_cross_site_CFA <- function(resp_chosen){
# doParallel::registerDoParallel()
### map over all sites instead of one single response variable
MID_site_data <- all_site %>%
map(.,~enet_site_nocom(resp_var = resp_chosen,
split_train =left_join(analysis(train_valid_split[[.]]$splits[[1]]),
CFA_list_site[[.]][[1]][["enet_tuning"]],
by="SUBJECTKEY")%>%
distinct(SUBJECTKEY, .keep_all = TRUE) ,
split_valid=left_join(assessment(train_valid_split[[.]]$splits[[1]]),
CFA_list_site[[.]][[1]][["RanFor_tuning"]],
by="SUBJECTKEY")%>%
distinct(SUBJECTKEY, .keep_all = TRUE) ,
split_test = left_join(test_site_split[[.]],
CFA_list_site[[.]][[1]][["testing_baseline"]],
by="SUBJECTKEY")%>%
distinct(SUBJECTKEY, .keep_all = TRUE) ,
contrast_name = "antiRewVsNeu_ROI_"))
names(MID_site_data)<- all_sample %>%
map(.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L))
MID_site <- all_site %>%
map(.,~enet_tuning_site(resp_var =resp_chosen,
data_list = MID_site_data[[.]]))
names(MID_site) <- all_sample %>%
map(.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L))
return(MID_site)
}
smri_cross_site_CFA <- function(resp_chosen){
# doParallel::registerDoParallel()
### map over all sites instead of one single response variable
smri_site_data <- all_site %>%
map(.,~enet_site_com(resp_var = resp_chosen,
split_train = left_join(analysis(train_valid_split[[.]]$splits[[1]]),
CFA_list_site[[.]][[1]][["enet_tuning"]],
by="SUBJECTKEY")%>%
distinct(SUBJECTKEY, .keep_all = TRUE) ,
split_valid=left_join(assessment(train_valid_split[[.]]$splits[[1]]),
CFA_list_site[[.]][[1]][["RanFor_tuning"]],
by="SUBJECTKEY")%>%
distinct(SUBJECTKEY, .keep_all = TRUE) ,
split_test = left_join(test_site_split[[.]],
CFA_list_site[[.]][[1]][["testing_baseline"]],
by="SUBJECTKEY")%>%
distinct(SUBJECTKEY, .keep_all = TRUE) ,
measure_one = "ASEG_",
measure_two = "Dest_Thick_"))
names(smri_site_data)<- all_sample %>%
map(.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L))
smri_site <- all_site %>%
map(.,~enet_tuning_site(resp_var =resp_chosen,
data_list = smri_site_data[[.]]))
names(smri_site)<- all_sample %>%
map(.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L))
return(smri_site)
}
rsmri_cross_site_CFA <- function(resp_chosen){
# doParallel::registerDoParallel()
### map over all sites instead of one single response variable
rsmri_site_data <- all_site %>%
map(.,~enet_site_com(resp_var = resp_chosen,
split_train = left_join(analysis(train_valid_split[[.]]$splits[[1]]),
CFA_list_site[[.]][[1]][["enet_tuning"]],
by="SUBJECTKEY")%>%
distinct(SUBJECTKEY, .keep_all = TRUE) ,
split_valid=left_join(assessment(train_valid_split[[.]]$splits[[1]]),
CFA_list_site[[.]][[1]][["RanFor_tuning"]],
by="SUBJECTKEY")%>%
distinct(SUBJECTKEY, .keep_all = TRUE) ,
split_test = left_join(test_site_split[[.]],
CFA_list_site[[.]][[1]][["testing_baseline"]],
by="SUBJECTKEY")%>%
distinct(SUBJECTKEY, .keep_all = TRUE) ,
measure_one = "par",
measure_two = "Within"))
names(rsmri_site_data) <- all_sample %>%
map (.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L))
rsmri_site <- all_site %>%
map(.,~enet_tuning_site(resp_var =resp_chosen,
data_list = rsmri_site_data[[.]]))
names(rsmri_site) <- all_sample %>%
map(.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L))
return(rsmri_site)
}
DTI_cross_site_CFA <- function(resp_chosen){
# doParallel::registerDoParallel()
### map over all sites instead of one single response variable
DTI_site_data <- all_site %>%
map(.,~enet_site_comOne(resp_var = resp_chosen,
split_train =
left_join(analysis(train_valid_split[[.]]$splits[[1]]),
CFA_list_site[[.]][[1]][["enet_tuning"]],
by="SUBJECTKEY")%>%
distinct(SUBJECTKEY, .keep_all = TRUE) ,
split_valid=left_join(assessment(train_valid_split[[.]]$splits[[1]]),
CFA_list_site[[.]][[1]][["RanFor_tuning"]],
by="SUBJECTKEY")%>%
distinct(SUBJECTKEY, .keep_all = TRUE) ,
split_test = left_join(test_site_split[[.]],
CFA_list_site[[.]][[1]][["testing_baseline"]],
by="SUBJECTKEY")%>%
distinct(SUBJECTKEY, .keep_all = TRUE) ,
measure_one = "FA_"))
names(DTI_site_data) <- all_sample %>%
map(.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L))
DTI_site <- all_site %>%
map(.,~enet_tuning_site(resp_var =resp_chosen,
data_list = DTI_site_data[[.]]))
names(DTI_site)<- all_sample %>% map(.,~unique(assessment(site_vfold$splits[[which(site_vfold$id==.)]])$SITE_ID_L))
return(DTI_site)
}
Nback_enet_cross_site_CFA <- CFA_resp_site %>% map(.,~Nback_cross_site_CFA(resp_chosen =.))
SST_enet_cross_site_CFA <- CFA_resp_site %>% map(.,~SST_cross_site_CFA(resp_chosen =.))
MID_enet_cross_site_CFA <- CFA_resp_site %>% map(.,~MID_cross_site_CFA(resp_chosen =.))
smri_enet_cross_site_CFA <- CFA_resp_site %>% map(.,~smri_cross_site_CFA(resp_chosen =.))
rsmri_enet_cross_site_CFA <- CFA_resp_site%>% map(.,~rsmri_cross_site_CFA(resp_chosen =.))
DTI_enet_cross_site_CFA <- CFA_resp_site %>% map(.,~DTI_cross_site_CFA(resp_chosen =.))
rf_enet_cross_site_CFA <- list("Nback_site"=Nback_enet_cross_site_CFA,
"SST_site"=SST_enet_cross_site_CFA,
"MID_site"=MID_enet_cross_site_CFA,
"smri_site"=smri_enet_cross_site_CFA,
"rsmri_site"=rsmri_enet_cross_site_CFA,
"DTI_site"=DTI_enet_cross_site_CFA)
all_cores <- parallel::detectCores(logical = FALSE) - 2
mri_site <- paste0(mri_vec,"_site")
for(i in 1:length(mri_site)){
names(rf_enet_cross_site_CFA[[mri_site[i]]])<- CFA_resp_site
}
random_forest_validsite_CFA <- CFA_resp_site %>% map(.,function(resp_idx=.){
random_forest_onesite <- all_site %>%map(.,
~rnd_for_data_site(pred_select = "predicted",
data_list=rf_enet_cross_site_CFA,site_idx = .,
resp_chose=resp_idx,
resp_data=CFA_list_site[[.]][[1]][["RanFor_tuning"]]) )
names(random_forest_onesite) <- all_site
return(random_forest_onesite)
})
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(resp_chose)` instead of `resp_chose` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
names(random_forest_validsite_CFA) <- CFA_resp_site
random_forest_testsite_CFA <- CFA_resp_site %>% map(.,function(resp_idx=.){
random_forest_onesite <- all_site %>%
map(.,~rnd_for_data_site(pred_select = "predicted_test",
data_list=rf_enet_cross_site_CFA,
site_idx = .,
resp_chose=resp_idx,
resp_data=CFA_list_site[[.]][[1]][["testing_baseline"]]))
names(random_forest_onesite) <- all_site
return(random_forest_onesite)
})
names(random_forest_testsite_CFA) <- CFA_resp_site
##keep from using all the cores
##keep your computer safe
no_cores <- availableCores() - 4
plan(multisession,workers= no_cores)
### this computation takes more than 10 cores and around 50 gb ram
random_forest_crosssite_CFA <- CFA_resp_site %>% future_map(.,function(resp_idx=.){
random_forest_oneresp <-all_site %>% future_map(.,~random_forest_site(resp = resp_idx,
valid_list = random_forest_validsite_CFA,
test_list = random_forest_testsite_CFA,site_chose = .),
.options = furrr_options(seed=TRUE))
names(random_forest_oneresp) <- all_site
return(random_forest_oneresp)
},.options = furrr_options(seed=TRUE))
### enet
modality_site_CFA <-names(rf_enet_cross_site_CFA) %>% str_remove_all("_site")
names(rf_enet_cross_site_CFA)<- modality_site_CFA
cross_site_enet_performance_CFA = function(resp_vec,data_input,modality_input,target_data){
data_split <- data_input[[modality_input]]
one_modality_extract <- resp_vec %>%
map(.,function(resp_input=.){
performance_extract <- all_site %>%
map(.,function(site_idx){
performance_site <-
data_split[[resp_input]][[site_idx]][["param_tune"]][["test_performance"]]
mae_frame <- left_join(data_split[[resp_input]][[site_idx]][["predicted_test"]],
target_data[[site_idx]][[1]][["testing_baseline"]],
by="SUBJECTKEY")%>%
select("pred",all_of(resp_input) ,"SUBJECTKEY") %>%
distinct(SUBJECTKEY, .keep_all = TRUE)%>% drop_na()
mae_frame <- mae_frame %>% mutate(scaled_target = as.vector(scale(mae_frame[[resp_input]])) )
## the data has to be referenced like this to be put into this function
mae_perform <- yardstick::mae(data = mae_frame,
truth=scaled_target,
estimate=pred)
rmse_perform <- yardstick::rmse(data = mae_frame,
truth=scaled_target,
estimate=pred)
tradRsq_perform <- yardstick::rsq_trad(data = mae_frame,
truth=scaled_target,
estimate=pred)
combined_results <- tibble(performance = as.numeric(performance_site),
site=site_idx,
modality=modality_input,
response=resp_input,
mae_performance=mae_perform$.estimate,
rmse_performance=rmse_perform$.estimate,
tradRsq_performance=tradRsq_perform$.estimate
)
return(combined_results)
})%>%
bind_rows()
})
}
performance_enet_cross_site_CFA <- modality_vec %>%
map(.,~cross_site_enet_performance_CFA(resp_vec = CFA_resp_site,
data_input = rf_enet_cross_site_CFA,
modality_input = .,target_data =CFA_list_site))
names(performance_enet_cross_site_CFA) <- modality_vec
for(i in 1: length(modality_vec)){
names(performance_enet_cross_site_CFA[[modality_vec[i]]])<- CFA_resp_site
}
response_enet_cross_site_CFA <- CFA_resp_site %>%
map(.,function(resp_input=.,
data_input=performance_enet_cross_site_CFA){
one_resp_performance <- modality_site_CFA %>%
map(.,function(modality_input=.){
one_resp_modality <- data_input[[modality_input]][[resp_input]]
return(one_resp_modality)
})%>% bind_rows()
one_resp_performance_test<- one_resp_performance
return(one_resp_performance_test)
})
names(response_enet_cross_site_CFA) <- CFA_resp_site
### random forest
names(random_forest_crosssite_CFA)<- CFA_resp_site
cross_site_rf_performance_CFA = function(resp_idx,data_input=random_forest_crosssite,
plotting_frame=resp_var_plotting,target_data=test_site_split){
site_performance <- all_site %>% map(.,function(site_idx){
one_site <- data_input[[resp_idx]][[site_idx]][["param_tune"]][["performance"]]
mae_frame <- left_join(data_input[[resp_idx]][[site_idx]][["predicted"]],
target_data[[site_idx]][[1]][["testing_baseline"]],
by="SUBJECTKEY")%>% select("pred",all_of(resp_idx) ,"SUBJECTKEY") %>%
distinct(SUBJECTKEY, .keep_all = TRUE)%>% drop_na()
mae_frame <- mae_frame %>% mutate(scaled_target = as.vector(scale(mae_frame[[resp_idx]])) )
## the data has to be referenced like this to be put into this function
mae_perform <- yardstick::mae(data = mae_frame,truth=scaled_target,estimate=pred)
rmse_perform <- yardstick::rmse(data = mae_frame,truth=scaled_target,estimate=pred)
tradRsq_perform <- yardstick::rsq_trad(data = mae_frame,truth=scaled_target,estimate=pred)
combined_results <- tibble(performance = as.numeric(one_site),
site=site_idx,
modality="random_forest",
response=resp_idx,
longer_name = plotting_frame$longer_name[which(plotting_frame$response==response)],
shorter_name = plotting_frame$short_name[which(plotting_frame$response==response[1])],
mae_performance=mae_perform$.estimate,
rmse_performance=rmse_perform$.estimate,
tradRsq_performance=tradRsq_perform$.estimate)
return(combined_results)
})%>% bind_rows()
}
performance_rf_cross_site_CFA <- CFA_resp_site %>%
map(.,~cross_site_rf_performance_CFA(resp_idx=.,
data_input = random_forest_crosssite_CFA,
plotting_frame = CFA_var_plotting,
target_data =CFA_list_site))
names(performance_rf_cross_site_CFA)<- CFA_resp_site
performance_all_modalities_CFA <- CFA_resp_site %>%
map(.,~bind_rows(performance_rf_cross_site_CFA[[.]],
response_enet_cross_site_CFA[[.]])%>%
mutate(site_num=str_remove(.[['site']],"site") ) )
names(performance_all_modalities_CFA)<- CFA_resp_site
#modality_ploting <- c("Stacked","NBack","MID", "SST","rs_fMRI","sMRI","DTI")
performance_all_modalities_CFA <- performance_all_modalities_CFA %>%
map(.,~change_modality_names(performance_frame = .))
CFA_resp_site %>%map(.,~ggplot(performance_all_modalities_CFA [[.]],
aes(site_num, performance,
col = modality))+
geom_point(key_glyph = "point", size = 3)+
labs(x = 'Site',
y=NULL,
col = 'Model',
title = paste0("Out-of-site Predictive Ability (Pearson's r)\n",
CFA_var_plotting$short_name[[which(CFA_var_plotting$response==.)]]))+
theme(axis.text.x=element_text(size=16, angle = 45),
axis.text.y=element_text(size=16),
axis.title.y=element_text(size=18),
axis.title.x=element_text(size=18),
plot.title=element_text(size=20),
legend.position = "right",
legend.title=element_blank(),
legend.text = element_text( size=18)))
[[1]]
## Warning: Removed 3 rows containing missing values.
CFA_resp_site %>%map(.,~ggplot(performance_all_modalities_CFA [[.]],
aes(site_num, performance_all_modalities_CFA [[.]]$mae_performance,
col = modality))+
geom_point(key_glyph = "point", size = 3)+
labs(x = 'Site',
y=NULL,
col = 'Model',
title = paste0("Out-of-site Predictive Ability (MAE)\n",
CFA_var_plotting$short_name[[which(CFA_var_plotting$response==.)]]))+
theme(axis.text.x=element_text(size=16, angle = 45),
axis.text.y=element_text(size=16),
axis.title.y=element_text(size=18),
axis.title.x=element_text(size=18),
plot.title=element_text(size=20),
legend.position = "right",
legend.title=element_blank(),
legend.text = element_text( size=18)))
[[1]]
## Warning: Removed 3 rows containing missing values.
CFA_resp_site %>%map(.,~ggplot(performance_all_modalities_CFA [[.]],
aes(site_num, performance_all_modalities_CFA [[.]]$rmse_performance,
col = modality))+
geom_point(key_glyph = "point", size = 3)+
labs(x = 'Site',
y=NULL,
col = 'Model',
title = paste0("Out-of-site Predictive Ability (RMSE)\n",
CFA_var_plotting$short_name[[which(CFA_var_plotting$response==.)]]))+
theme(axis.text.x=element_text(size=16, angle = 45),
axis.text.y=element_text(size=16),
axis.title.y=element_text(size=18),
axis.title.x=element_text(size=18),
plot.title=element_text(size=20),
legend.position = "right",
legend.title=element_blank(),
legend.text = element_text( size=18)))
[[1]]
## Warning: Removed 3 rows containing missing values.
CFA_resp_site %>%map(.,~ggplot(performance_all_modalities_CFA [[.]],
aes(site_num, performance_all_modalities_CFA [[.]]$tradRsq_performance,
col = modality))+
geom_point(key_glyph = "point", size = 3)+
labs(x = 'Site',
y=NULL,
col = 'Model',
title = paste0("Out-of-site Predictive Ability (R squared)\n",
CFA_var_plotting$short_name[[which(CFA_var_plotting$response==.)]]))+
theme(axis.text.x=element_text(size=16, angle = 45),
axis.text.y=element_text(size=16),
axis.title.y=element_text(size=18),
axis.title.x=element_text(size=18),
plot.title=element_text(size=20),
legend.position = "right",
legend.title=element_blank(),
legend.text = element_text( size=18)))
[[1]]
## Warning: Removed 3 rows containing missing values.
### getting summary statistics
performance_all_modalities_CFA <- performance_all_modalities_CFA %>% map(.,~mutate(.,Pearson_r =performance)%>%
mutate(.,R_squared = tradRsq_performance)%>%
mutate(.,MAE=mae_performance)%>%
mutate(.,RMSE=rmse_performance))
performance_all_modalities_CFA %>% map(.,~group_by(.,modality)%>%
summarise_at(c("Pearson_r","R_squared","MAE","RMSE"),
mean,na.rm=TRUE)%>%
mutate_at(vars(c("Pearson_r","R_squared","MAE","RMSE")),funs(round(.,3)))%>%
kableExtra::kbl(caption = paste0("Mean of all modalities cross sites ",
paste0("\n",resp_var_plotting$longer_name[which(resp_var_plotting$response==.)]))) %>%
kableExtra::kable_classic(full_width = F, html_font = "Cambria")
)
$g_second_order
modality | Pearson_r | R_squared | MAE | RMSE |
---|---|---|---|---|
Stacked | 0.461 | 0.211 | 0.699 | 0.887 |
NBack | 0.410 | 0.168 | 0.718 | 0.910 |
MID | 0.223 | 0.049 | 0.774 | 0.973 |
SST | 0.136 | 0.017 | 0.784 | 0.989 |
rs_fMRI | 0.261 | 0.068 | 0.763 | 0.964 |
sMRI | 0.246 | 0.062 | 0.763 | 0.967 |
DTI | 0.220 | 0.047 | 0.767 | 0.975 |
performance_all_modalities_CFA %>% map(.,~group_by(.,modality)%>%
summarise_at(c("Pearson_r","R_squared","MAE","RMSE"),
sd,na.rm=TRUE)%>%
mutate_at(vars(c("Pearson_r","R_squared","MAE","RMSE")),funs(round(.,3)))%>%
kableExtra::kbl(caption = paste0("Standard deviation of all modalities cross sites ",
paste0("\n",resp_var_plotting$longer_name[which(resp_var_plotting$response==.)]))) %>%
kableExtra::kable_classic(full_width = F, html_font = "Cambria")
)
$g_second_order
modality | Pearson_r | R_squared | MAE | RMSE |
---|---|---|---|---|
Stacked | 0.055 | 0.050 | 0.023 | 0.028 |
NBack | 0.063 | 0.051 | 0.027 | 0.029 |
MID | 0.093 | 0.044 | 0.019 | 0.023 |
SST | 0.066 | 0.020 | 0.012 | 0.010 |
rs_fMRI | 0.058 | 0.029 | 0.015 | 0.015 |
sMRI | 0.088 | 0.043 | 0.022 | 0.023 |
DTI | 0.071 | 0.035 | 0.017 | 0.018 |
corr_plot <- CFA_resp_site %>%map(.,~ggplot(performance_all_modalities_CFA [[.]],
aes(site_num, performance,
col = modality))+
geom_point(key_glyph = "point", size = 3)+
labs(x = NULL,
y=NULL,
col = 'Model',
title = paste0("Pearson's r"))+
theme(axis.text.x=element_text(size=14, angle = 45),
axis.text.y=element_text(size=14),
axis.title.y=element_text(size=16),
axis.title.x=element_text(size=16),
plot.title=element_text(size=18),
legend.position = "right",
legend.title=element_blank(),
legend.text = element_text( size=18)))
mae_plot <- CFA_resp_site %>%map(.,~ggplot(performance_all_modalities_CFA [[.]],
aes(site_num, performance_all_modalities_CFA [[.]]$mae_performance,
col = modality))+
geom_point(key_glyph = "point", size = 3)+
labs(x = NULL,
y=NULL,
col = 'Model',
title = paste0("MAE"))+
theme(axis.text.x=element_text(size=14, angle = 45),
axis.text.y=element_text(size=14),
axis.title.y=element_text(size=16),
axis.title.x=element_text(size=16),
plot.title=element_text(size=18),
legend.position = "right",
legend.title=element_blank(),
legend.text = element_text( size=18)))
rmse_plot <- CFA_resp_site %>%map(.,~ggplot(performance_all_modalities_CFA [[.]],
aes(site_num, performance_all_modalities_CFA [[.]]$rmse_performance,
col = modality))+
geom_point(key_glyph = "point", size = 3)+
labs(x = NULL,
y=NULL,
col = 'Model',
title = paste0("RMSE"))+
theme(axis.text.x=element_text(size=14, angle = 45),
axis.text.y=element_text(size=14),
axis.title.y=element_text(size=16),
axis.title.x=element_text(size=16),
plot.title=element_text(size=18),
legend.position = "right",
legend.title=element_blank(),
legend.text = element_text( size=18)))
rsq_plot <- CFA_resp_site %>%map(.,~ggplot(performance_all_modalities_CFA [[.]],
aes(site_num, performance_all_modalities_CFA [[.]]$tradRsq_performance,
col = modality))+
geom_point(key_glyph = "point", size = 3)+
labs(x = NULL,
y=NULL,
col = 'Model',
title = paste0("R squared"))+
theme(axis.text.x=element_text(size=14, angle = 45),
axis.text.y=element_text(size=14),
axis.title.y=element_text(size=16),
axis.title.x=element_text(size=16),
plot.title=element_text(size=18),
legend.position = "right",
legend.title=element_blank(),
legend.text = element_text( size=18)))
title_combined_plot <- ggdraw() +
draw_label(
"Out-of-site Predictive Ability",
fontface = 'bold',
x = 0,
hjust = 0,
size=24
) +
theme(
# add margin on the left of the drawing canvas,
# so title is aligned with left edge of first plot
plot.margin = margin(0, 0, 0, 5)
)
plot_combined_list <- list (corr_plot= corr_plot[[1]],
rsq_plot = rsq_plot[[1]],
mae_plot = mae_plot[[1]],
rmse_plot=rmse_plot[[1]])
combined_plot <- ggpubr::ggarrange(title_combined_plot,
ggpubr::ggarrange(plotlist=plot_combined_list,
common.legend = TRUE,
legend = "right"),
nrow = 2 , heights = c(0.1, 1))
## Warning: Removed 3 rows containing missing values.
## Removed 3 rows containing missing values.
## Removed 3 rows containing missing values.
## Removed 3 rows containing missing values.
## Removed 3 rows containing missing values.
ggpubr::annotate_figure(combined_plot,bottom = ggpubr::text_grob("Site",size=15))
pander::pander(sessionInfo())
R version 4.1.3 (2022-03-10)
Platform: x86_64-w64-mingw32/x64 (64-bit)
locale: LC_COLLATE=English_New Zealand.1252, LC_CTYPE=English_New Zealand.1252, LC_MONETARY=English_New Zealand.1252, LC_NUMERIC=C and LC_TIME=English_New Zealand.1252
attached base packages: stats4, grid, stats, graphics, grDevices, utils, datasets, methods and base
other attached packages: fastshap(v.0.0.7), ggsegJHU(v.1.0.1.001), ggsegDesterieux(v.1.0.1.002), ggsegExtra(v.1.5.33.004), ggseg3d(v.1.6.3), ggseg(v.1.6.4), vip(v.0.3.2), sva(v.3.42.0), BiocParallel(v.1.28.3), genefilter(v.1.76.0), mgcv(v.1.8-39), nlme(v.3.1-155), corpcor(v.1.6.10), permimp(v.1.0-2), party(v.1.3-10), strucchange(v.1.5-2), sandwich(v.3.0-1), zoo(v.1.8-10), modeltools(v.0.2-23), mvtnorm(v.1.1-3), furrr(v.0.3.0), future(v.1.25.0), plotly(v.4.10.0), cowplot(v.1.1.1), yardstick(v.0.0.9), workflowsets(v.0.2.1), workflows(v.0.2.6), tune(v.0.2.0), rsample(v.0.1.1), recipes(v.0.2.0), parsnip(v.0.2.1), modeldata(v.0.1.1), infer(v.1.0.0), dials(v.0.1.1), scales(v.1.2.0), tidymodels(v.0.2.0), pander(v.0.6.4), olsrr(v.0.5.3), glmnet(v.4.1-4), Matrix(v.1.4-0), eNetXplorer(v.1.1.3), broom(v.0.8.0), forcats(v.0.5.1), stringr(v.1.4.0), dplyr(v.1.0.8), purrr(v.0.3.4), readr(v.2.1.2), tidyr(v.1.2.0), tibble(v.3.1.6), ggplot2(v.3.3.6) and tidyverse(v.1.3.1)
loaded via a namespace (and not attached): rgl(v.0.108.3), Hmisc(v.4.6-0), svglite(v.2.1.0), class(v.7.3-20), foreach(v.1.5.2), crayon(v.1.5.1), MASS(v.7.3-55), backports(v.1.4.1), reprex(v.2.0.1), rlang(v.1.0.2), XVector(v.0.34.0), readxl(v.1.4.0), extrafontdb(v.1.0), nloptr(v.2.0.0), limma(v.3.50.3), naniar(v.0.6.1), extrafont(v.0.18), bit64(v.4.0.5), glue(v.1.6.2), parallel(v.4.1.3), oro.nifti(v.0.11.0), AnnotationDbi(v.1.56.2), BiocGenerics(v.0.40.0), classInt(v.0.4-3), haven(v.2.5.0), tidyselect(v.1.1.2), RRPP(v.1.2.3), XML(v.3.99-0.9), calibrate(v.1.7.7), sf(v.1.0-7), ggpubr(v.0.4.0), SuppDists(v.1.1-9.7), xtable(v.1.8-4), magrittr(v.2.0.3), evaluate(v.0.15), cli(v.3.2.0), zlibbioc(v.1.40.0), rstudioapi(v.0.13), sp(v.1.4-7), DiceDesign(v.1.9), bslib(v.0.3.1), rpart(v.4.1.16), pbmcapply(v.1.5.1), numform(v.0.7.0), xfun(v.0.30), cluster(v.2.1.2), caTools(v.1.18.2), KEGGREST(v.1.34.0), RNifti(v.1.4.0), expm(v.0.999-6), ape(v.5.6-2), listenv(v.0.8.0), reshape(v.0.8.9), Biostrings(v.2.62.0), png(v.0.1-7), ipred(v.0.9-12), withr(v.2.5.0), ggforce(v.0.3.3), neurobase(v.1.32.1), bitops(v.1.0-7), freesurfer(v.1.6.8), ranger(v.0.13.1), plyr(v.1.8.6), cellranger(v.1.1.0), hardhat(v.0.2.0), e1071(v.1.7-9), pROC(v.1.18.0), coda(v.0.19-4), pillar(v.1.7.0), RcppParallel(v.5.1.5), gplots(v.3.1.3), cachem(v.1.0.6), multcomp(v.1.4-19), fs(v.1.5.2), raster(v.3.5-15), geomorph(v.4.0.3), vctrs(v.0.4.0), pbivnorm(v.0.6.0), ellipsis(v.0.3.2), generics(v.0.1.2), nortest(v.1.0-4), lava(v.1.6.10), rgdal(v.1.5-31), tools(v.4.1.3), foreign(v.0.8-82), tweenr(v.1.0.2), munsell(v.0.5.0), emmeans(v.1.7.3), proxy(v.0.4-26), fastmap(v.1.1.0), compiler(v.4.1.3), abind(v.1.4-5), stars(v.0.5-5), DescTools(v.0.99.45), semPlot(v.1.1.5), GenomeInfoDbData(v.1.2.7), prodlim(v.2019.11.13), gridExtra(v.2.3), OpenMx(v.2.20.6), edgeR(v.3.36.0), lattice(v.0.20-45), utf8(v.1.2.2), jsonlite(v.1.8.0), arm(v.1.12-2), GGally(v.2.1.2), gld(v.2.6.4), pbapply(v.1.5-0), carData(v.3.0-5), estimability(v.1.3), lazyeval(v.0.2.2), car(v.3.0-13), R.utils(v.2.11.0), latticeExtra(v.0.6-29), goftest(v.1.2-3), checkmate(v.2.0.0), rmarkdown(v.2.14), openxlsx(v.4.2.5), webshot(v.0.5.3), Biobase(v.2.54.0), igraph(v.1.2.11), survival(v.3.2-13), numDeriv(v.2016.8-1.1), yaml(v.2.3.5), timeROC(v.0.4), systemfonts(v.1.0.4), survivalROC(v.1.0.3), htmltools(v.0.5.2), memoise(v.2.0.1), lavaan(v.0.6-11), locfit(v.1.5-9.5), IRanges(v.2.28.0), viridisLite(v.0.4.0), digest(v.0.6.29), assertthat(v.0.2.1), timereg(v.2.0.2), lwgeom(v.0.2-8), Rttf2pt1(v.1.3.10), units(v.0.8-0), RSQLite(v.2.2.13), future.apply(v.1.9.0), rockchalk(v.1.8.151), Exact(v.3.1), data.table(v.1.14.2), blob(v.1.2.3), R.oo(v.1.24.0), S4Vectors(v.0.32.4), lhs(v.1.1.5), splines(v.4.1.3), Formula(v.1.2-4), labeling(v.0.4.2), RCurl(v.1.98-1.6), pec(v.2022.03.06), hms(v.1.1.1), modelr(v.0.1.8), colorspace(v.2.0-3), base64enc(v.0.1-3), mnormt(v.2.0.2), survcomp(v.1.44.1), shape(v.1.4.6), tmvnsim(v.1.0-2), libcoin(v.1.0-9), nnet(v.7.3-17), sass(v.0.4.1), Rcpp(v.1.0.8), coin(v.1.4-2), GPfit(v.1.0-8), fansi(v.1.0.3), tzdb(v.0.3.0), parallelly(v.1.31.1), R6(v.2.5.1), lifecycle(v.1.0.1), rootSolve(v.1.8.2.3), zip(v.2.2.0), semTools(v.0.5-6), ggsignif(v.0.6.3), minqa(v.1.2.4), mi(v.1.0), jquerylib(v.0.1.4), qgraph(v.1.9.2), glasso(v.1.11), TH.data(v.1.1-1), RColorBrewer(v.1.1-3), iterators(v.1.0.14), gower(v.1.0.0), htmlwidgets(v.1.5.4), polyclip(v.1.10-0), GPArotation(v.2022.4-1), terra(v.1.5-21), rvest(v.1.0.2), globals(v.0.14.0), lmom(v.2.8), htmlTable(v.2.4.0), codetools(v.0.2-18), matrixStats(v.0.62.0), lubridate(v.1.8.0), randomForest(v.4.7-1), gtools(v.3.9.2), prettyunits(v.1.1.1), psych(v.2.2.3), dbplyr(v.2.1.1), R.methodsS3(v.1.8.1), GenomeInfoDb(v.1.30.1), gtable(v.0.3.0), DBI(v.1.1.2), visdat(v.0.5.3), smoothr(v.0.2.2), httr(v.1.4.3), highr(v.0.9), KernSmooth(v.2.23-20), vroom(v.1.5.7), stringi(v.1.7.6), progress(v.1.2.2), reshape2(v.1.4.4), farver(v.2.1.0), annotate(v.1.72.0), fdrtool(v.1.2.17), magick(v.2.7.3), timeDate(v.3043.102), lisrelToR(v.0.1.4), xml2(v.1.3.3), boot(v.1.3-28), kableExtra(v.1.3.4), rmeta(v.3.0), lme4(v.1.1-28), sem(v.3.1-14), kutils(v.1.70), bit(v.4.0.4), jpeg(v.0.1-9), pkgconfig(v.2.0.3), rstatix(v.0.7.0), bootstrap(v.2019.6) and knitr(v.1.39)