## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(echo = TRUE)

## ----echo = FALSE, results = "hide", warning = FALSE, eval = TRUE-------------
suppressPackageStartupMessages({
    library(pbapply)
    library(Matrix)
    library(igraph)
})

## ----eval = TRUE--------------------------------------------------------------
library(pbapply)
library(Matrix)
library(igraph)
library(SmCCNet)
library(parallel)

## ----flowPlot, out.width="100%", fig.cap = "Workflow for SmCCNet single-omics setting (binary and quantitative phenotype).", echo = FALSE----
knitr::include_graphics("figures/single-omics-smccnet.jpg")

## ----example data-------------------------------------------------------------
data(ExampleData)
head(X1[ , 1:6])
head(Y)

## ----p1p2, eval = TRUE--------------------------------------------------------
p1 <- ncol(X1)
N <- nrow(X1)
AbarLabel <- colnames(X1)

## ----eval = FALSE-------------------------------------------------------------
# # preprocess data
# processed_data <- dataPreprocess(X = as.data.frame(X1), covariates = NULL,
#                   is_cv = TRUE, cv_quantile = 0.2, center = TRUE, scale = TRUE)

## ----CVpara, eval = FALSE, warning = FALSE------------------------------------
# K <- 3 # number of folds in K-fold CV.
# s1 <- 0.7
# subSamp <- 50 # number of subsamples (will be described in later section).
# 
# # create sparsity penalty options.
# pen1 <- seq(.05, .3, by = .05)
# 
# # set a CV directory.
# pheno <- "Example3foldCV"
# CVDir <- file.path(tempdir(),"Example3foldCV")
# dir.create(CVDir,showWarnings = FALSE)

## ----make K-fold, eval = FALSE------------------------------------------------
# # set random seed
# set.seed(12345)
# 
# # save data and parameters into local directory
# save(X1, Y, s1, subSamp, pen1,
#      file = paste0(CVDir, "Data.Rdata"))
# 
# # split data into K folds
# foldIdx <- split(1:N, sample(1:N, K))
# for(i in 1:K){
#   iIdx <- foldIdx[[i]]
#   x1.train <- scale(X1[-iIdx, ])
#   yy.train <- Y[-iIdx, ]
#   x1.test <- scale(X1[iIdx, ])
#   yy.test <- Y[iIdx, ]
# 
#   if(is.na(min(min(x1.train), min(yy.train), min(x1.test), min(yy.test)))){
#     stop("Invalid scaled data.")
#   }
# 
#   subD <- paste0(CVDir, "CV_", i, "/")
#   dir.create(subD)
# 
#   # save data from each fold into local directory
#   save(x1.train, yy.train, x1.test, yy.test,
#        pen1, p1,
#        file = paste0(subD, "Data.Rdata"))
# }

## ----run CV, eval = FALSE-----------------------------------------------------
# # number of clusters in parSapply should be the same as number specified above
# suppressWarnings(for (CVidx in 1:K)
# {
#   # define the sub-directory for each fold
#   subD <- paste0(CVDir, "CV_", CVidx, "/")
#   # load fold data
#   load(paste0(subD, "Data.Rdata"))
#   dir.create(paste0(subD, "SmCCA/"))
#   # create empty vector to store cross-validation result
#   RhoTrain <- RhoTest <- DeltaCor <- rep(0, length(pen1))
#   # evaluate through all the possible penalty candidates
#   for(idx in 1:length(pen1)){
#     l1 <- pen1[idx]
#     print(paste0("Running SmCCA on CV_", CVidx, " pen=", l1))
#     # run single-omics SmCCNet
#     Ws <- getRobustWeightsSingle(x1.train, as.matrix(yy.train), l1, 1,
#                                     SubsamplingNum = 1)
#     # average
#     meanW <- rowMeans(Ws)
#     v <- meanW[1:p1]
# 
#     rho.train <-  cor(x1.train %*% v, yy.train)
# 
# 
#     rho.test <- cor(x1.test %*% v, yy.test)
# 
# 
#     RhoTrain[idx] <- round(rho.train, digits = 5)
#     RhoTest[idx] <- round(rho.test, digits = 5)
#     DeltaCor[idx] <- abs(rho.train - rho.test)
# 
# 
# 
#   }
# 
#   DeltaCor.all <- cbind(pen1, RhoTrain, RhoTest, DeltaCor)
#   colnames(DeltaCor.all) <- c("l1", "Training CC", "Test CC", "CC Pred. Error")
#   write.csv(DeltaCor.all,
#             file = paste0(subD, "SmCCA/SCCA_", subSamp,"_allDeltaCor.csv"))
# 
# 
# })

## ----aggregate error, eval = FALSE--------------------------------------------
# # combine prediction errors from all K folds and compute the total prediction
# # error for each sparsity penalty pair.
# aggregateCVSingle(CVDir, "SmCCA", NumSubsamp = subSamp, K = K)

## ----get abar, eval = FALSE---------------------------------------------------
# # set up directory to store all the results
# plotD <- paste0(CVDir, "Figures/")
# saveD <- paste0(CVDir, "Results/")
# dataF <- paste0(CVDir, "Data.Rdata")
# dir.create(plotD)
# dir.create(saveD)
# dir.create(dataF)
# 
# # type of CCA result, only "SmCCA" supported
# Method = "SmCCA"
# 
# 
# # after SmCCA CV, select the best penalty term,
# # and use it for running SmCCA on the complete dataset
# for(Method in "SmCCA"){
#   # select optimal penalty term from CV result
#   T12 <- read.csv(paste0(CVDir, "Results/", Method, "CVmeanDeltaCors.csv"))
#   # calculate evaluation metric **
#   pen <- T12[which.min(T12[ ,4]/abs(T12[ ,3])) ,2]
# 
# 
#   l1 <- pen;
#   system.time({
#     Ws <- getRobustWeightsSingle(X1 = X1, Trait = as.matrix(Y),
#                                        Lambda1 = l1,
#                                        s1, SubsamplingNum = subSamp)
# 
# 
#     Abar <- getAbar(Ws, FeatureLabel = AbarLabel[1:p1])
#     save(l1, X1, Y, s1, Ws, Abar,
#          file = paste0(saveD, Method, K, "foldSamp", subSamp, "_", pen,
#                        ".Rdata"))
#   })
# 
# 
# }

## ----get modules, eval = FALSE------------------------------------------------
# # perform clustering based on the adjacency matrix Abar
# OmicsModule <- getOmicsModules(Abar, PlotTree = FALSE)
# 
# # make sure there are no duplicated labels
# AbarLabel <- make.unique(AbarLabel)
# 
# # calculate feature correlation matrix
# bigCor2 <- cor(X1)
# 
# # data type
# types <- rep('gene', nrow(bigCor2))

## ----eval = FALSE-------------------------------------------------------------
# # filter out network modules with insufficient number of nodes
# module_length <- unlist(lapply(OmicsModule, length))
# network_modules <- OmicsModule[module_length > 10]
# # extract pruned network modules
# for(i in 1:length(network_modules))
# {
#   cat(paste0('For network module: ', i, '\n'))
#   # define subnetwork
#   abar_sub <- Abar[network_modules[[i]],network_modules[[i]]]
#   cor_sub <- bigCor2[network_modules[[i]],network_modules[[i]]]
#   # prune network module
#   networkPruning(Abar = abar_sub,CorrMatrix = cor_sub,
#                           type = types[network_modules[[i]]],
#                  data = X1[,network_modules[[i]]],
# 			  Pheno = Y, ModuleIdx = i, min_mod_size = 10,
#                           max_mod_size = 100, method = 'PCA',
#                           saving_dir = tempdir())
#   cat("\n")
# }

## ----eval = TRUE,echo = FALSE, warning = FALSE, message = FALSE---------------
load('../vignettes/cont_size_16_net_1.Rdata')
row.names(omics_correlation_data) <- NULL
colnames(omics_correlation_data) <- c('Molecular Feature', 'Correlation to Phenotype', 'P-value')
knitr::kable(omics_correlation_data, caption = 'Individual molecular features correlation table with respect to phenotype (correlation and p-value).')

## ----loadings1, echo = FALSE, warning = FALSE, message = FALSE, out.width="100%", fig.cap = "PC1 loading for each subnetwork feature."----
library(grid)
library(tidyverse)
library(shadowtext)
library(reshape2)
BLUE <- "#076fa2"
data <- data.frame(name = row.names(pc_loading), loading = abs(pc_loading[,1]))
plt <- ggplot(data) +
  geom_col(aes(loading, name), fill = BLUE, width = 0.6) 

plt

## ----corheatmap, echo = FALSE, warning = FALSE, message = FALSE, out.width="100%", fig.cap = "Correlation heatmap for subnetwork features."----
###### Correlation heatmap
melted_cormat <- melt(correlation_sub)
ggplot(data = melted_cormat, aes(x=Var1, y=Var2, fill=value)) + 
  geom_tile() +
   labs(title = "Correlation Heatmap of Network Features") + 
  theme(plot.title.position = "plot")

## ----adjheatmap, echo = FALSE, warning = FALSE, message = FALSE, out.width="100%", fig.cap = "Adjacency matrix heatmap for subnetwork features."----
###### Correlation heatmap
melted_cormat <- melt(as.matrix(M))
ggplot(data = melted_cormat, aes(x=Var1, y=Var2, fill=value)) + 
  geom_tile() +
   labs(title = "Adjacency Matrix Heatmap of Network Features") + 
  theme(plot.title.position = "plot")


## ----eval = FALSE-------------------------------------------------------------
# library(RCy3)
# library(igraph)
# # load subnetwork data (example, user need to provide the directory)
# load('ResultDirectory/size_a_net_b.Rdata')
# M <- as.matrix(M)
# 
# # correlation matrix for the subnetwork
# filter_index <- which(abs(correlation_sub) < 0.05)
# M_ind <- ifelse(correlation_sub > 0, 1, -1)
# M_adj <- M * M_ind
# M_adj[filter_index] <- 0
# diag(M_adj) <- 0
# 
# # network visualization through cytoscape
# graph <- igraph::graph_from_adjacency_matrix(M_adj, mode = 'undirected',
#          weighted = TRUE, diag = TRUE, add.colnames = NULL, add.rownames = NA)
# 
# # define network node type and connectivity
# V(graph)$type <- sub_type
# V(graph)$type
# V(graph)$connectivity <- rowSums(abs(M))
# V(graph)$connectivity
# 
# createNetworkFromIgraph(graph,"single_omics_network")

## ----netPlot, out.width="100%", fig.cap = "Trimmed module 1. The strength of the node connections is indicated by the thickness of edges. Red edges and blue edges are for negative and positive connections respectively.", echo = FALSE----
knitr::include_graphics("../vignettes/example_network_cont.png")

## ----example data binary------------------------------------------------------
data(ExampleData)
head(X1[ , 1:6])
head(Y)
# binarize quantitative outcome 
Y <- ifelse(Y > median(Y), 1, 0)

## ----p1p2 binary, eval = TRUE-------------------------------------------------
p1 <- ncol(X1)
N <- nrow(X1)
AbarLabel <- colnames(X1)

## ----eval = FALSE-------------------------------------------------------------
# K <- 3 # number of folds in K-fold CV.
# s1 <- 0.7 # feature subsampling percentage.
# subSamp <- 50 # number of subsamples.
# CCcoef <- NULL # unweighted version of SmCCNet.
# metric <- 'auc' # evaluation metric to be used.
# 
# # create sparsity penalty options.
# pen1 <- seq(.1, .9, by = .1)
# 
# # set a CV directory.
# pheno <- "Example3foldCV_Binary"
# CVDir <- file.path(tempdir(),"Example3foldCV_Binary")
# dir.create(CVDir,showWarnings = FALSE)
# 
# Y <- ifelse(Y > median(Y), 1, 0)

## ----make K-fold 2, eval = FALSE----------------------------------------------
# set.seed(12345) # set random seed.
# # save original data into local directory
# save(X1, Y, s1, subSamp, pen1,
#      file = paste0(CVDir, "Data.Rdata"))
# 
# 
# # define the number of components to be extracted
# ncomp <- 3
# # split data into k folds
# foldIdx <- split(1:N, sample(1:N, K))
# for(i in 1:K){
#   iIdx <- foldIdx[[i]]
#   x1.train <- scale(X1[-iIdx, ])
#   yy.train <- Y[-iIdx, ]
#   x1.test <- scale(X1[iIdx, ])
#   yy.test <- Y[iIdx, ]
# 
#   if(is.na(min(min(x1.train), min(yy.train), min(x1.test), min(yy.test)))){
#     stop("Invalid scaled data.")
#   }
# 
#   subD <- paste0(CVDir, "CV_", i, "/")
#   dir.create(subD)
#   save(x1.train, yy.train, x1.test, yy.test,
#       pen1, p1,
#        file = paste0(subD, "Data.Rdata"))
# }
# 
# ############################################## Running Cross-Validation
# 
# # run SmCCA with K-fold cross-validation
# suppressWarnings(for (CVidx in 1:K){
#   # define evaluation method
#   EvalMethod <- 'precision'
#   # define and create saving directory
#   subD <- paste0(CVDir, "CV_", CVidx, "/")
#   load(paste0(subD, "Data.Rdata"))
#   dir.create(paste0(subD, "SmCCA/"))
# 
#   # create empty vector to store cross-validation accuracy result
#   TrainScore <- TestScore <- rep(0, length(pen1))
#   for(idx in 1:length(pen1)){
#     # define value of penalty
#     l1 <- pen1[idx]
# 
#     # run SPLS-DA to extract canonical weight
#     Ws <- spls::splsda(x = x1.train, y = yy.train, K = ncomp, eta = l1,
#                        kappa=0.5,
#                        classifier= 'logistic', scale.x=FALSE)
# 
#     # create emtpy matrix to save canonical weights for each subsampling
#     weight <- matrix(0,nrow = ncol(x1.train), ncol = ncomp)
#     weight[Ws[["A"]],] <- Ws[["W"]]
# 
#     # train the latent factor model with logistic regression
#     train_data <- data.frame(x = (x1.train %*% weight)[,1:ncomp],
#                              y = as.factor(yy.train))
#     test_data <- data.frame(x = (x1.test %*% weight)[,1:ncomp])
# 
#     logisticFit <- stats::glm(y~., family = 'binomial',data = train_data)
#     # make prediction for train/test set
#     train_pred <- stats::predict(logisticFit, train_data, type = 'response')
#     test_pred <- stats::predict(logisticFit, test_data, type = 'response')
#     # specifying which method to use as evaluation metric
#     TrainScore[idx] <- classifierEval(obs = yy.train,
#                                       pred = train_pred,
#                                       EvalMethod = metric, print_score = FALSE)
#     TestScore[idx] <- classifierEval(obs = yy.test,
#                                       pred = test_pred,
#                                       EvalMethod = metric, print_score = FALSE)
# 
#   }
# 
#   # combine cross-validation results
#   DeltaAcc.all <- as.data.frame(cbind(pen1, TrainScore, TestScore))
#   DeltaAcc.all$Delta <- abs(DeltaAcc.all$TrainScore - DeltaAcc.all$TestScore)
#   colnames(DeltaAcc.all) <- c("l1", "Training Score", "Testing Score", "Delta")
# 
#   # save cross-validation results to local directory
#   write.csv(DeltaAcc.all,
#             file = paste0(subD, "SmCCA/SCCA_", subSamp,"_allDeltaCor.csv"))
# 
# }
# )

## ----eval = FALSE-------------------------------------------------------------
# # save cross-validation result
# cv_result <- aggregateCVSingle(CVDir, "SmCCA", NumSubsamp = subSamp, K = 3)
# 
# # create directory to save overall result with the best penalty term
# plotD <- paste0(CVDir, "Figures/")
# saveD <- paste0(CVDir, "Results/")
# dataF <- paste0(CVDir, "Data.Rdata")
# dir.create(plotD)
# dir.create(saveD)
# dir.create(dataF)
# 
# # specify method (only SmCCA works)
# Method = "SmCCA"
# 
# for(Method in "SmCCA"){
#   # read cross validation result in
#   T12 <- read.csv(paste0(CVDir, "Results/", Method, "CVmeanDeltaCors.csv"))
#   # determine the optimal penalty term
#   pen <- T12[which.max(T12[ ,3]) ,2]
#   l1 <- pen;
#   system.time({
#     # run SPLSDA with optimal penalty term
#     Ws <- getRobustWeightsSingleBinary(X1 = X1, Trait = as.matrix(Y),
#                                        Lambda1 = l1,
#                                        s1, SubsamplingNum = subSamp)
# 
#     # get adjacency matrix
#     Abar <- getAbar(Ws, FeatureLabel = AbarLabel[1:p1])
#     # save result into local directory
#     save(l1, X1, Y, s1, Ws, Abar,
#          file = paste0(saveD, Method, K, "foldSamp", subSamp, "_", pen,
#                        ".Rdata"))
#   })
# 
# 
# }

## ----sessionInfo--------------------------------------------------------------
sessionInfo()
warnings()