## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## -----------------------------------------------------------------------------
library(MSstatsSampleSize)

## -----------------------------------------------------------------------------
# # read in protein abundance sheet
# # The CSV sheet has 173 columns from control and cancer groups.
# # Each row is protein and each column (except the first column) is biological replicate.
# # The first column 'Protein' contains the protein names.

# OV_SRM_train <- read.csv(file = "OV_SRM_train.csv")
# # assign the column 'Protein' as row names
# rownames(OV_SRM_train) <- OV_SRM_train$Protein 
# # remove the column 'Protein
# OV_SRM_train <- OV_SRM_train[, colnames(OV_SRM_train)!="Protein"]
head(OV_SRM_train)

# Read in annotation including condition and biological replicates.
# Users should make this annotation file.
# OV_SRM_train_annotation <- read.csv(file="OV_SRM_train_annotation.csv", header=TRUE)
head(OV_SRM_train_annotation)

# estimate the mean protein abunadnce and variance in each condition
variance_estimation <- estimateVar(data = OV_SRM_train, 
                                   annotation = OV_SRM_train_annotation)

# the mean protein abundance in each condition
head(variance_estimation$mu)

# the standard deviation in each condition
head(variance_estimation$sigma)

# the mean protein abundance across all the conditions
head(variance_estimation$promean)

## ---- eval=FALSE--------------------------------------------------------------
#  #  output a pdf file with mean-SD plot
#  meanSDplot(variance_estimation)

## ----message = FALSE , warning = FALSE----------------------------------------
# expected_FC = "data": fold change estimated from OV_SRM_train
# select_simulated_proteins = "proportion": select the simulated proteins based on the proportion of total proteins
# simulate_valid = FALSE: use input OV_SRM_train as validation set
simulated_datasets <- simulateDataset(data = OV_SRM_train,
                                      annotation = OV_SRM_train_annotation,
                                      num_simulations = 10, # simulate 10 times
                                      expected_FC = "data", 
                                      list_diff_proteins =  NULL,
                                      select_simulated_proteins = "proportion",
                                      protein_proportion = 1.0,
                                      protein_number = 1000,
                                      samples_per_group = 50, # 50 samples per condition
                                      simulate_validation = FALSE, 
                                      valid_samples_per_group = 50)

## -----------------------------------------------------------------------------
# the number of simulated proteins
simulated_datasets$num_proteins

# a vector with the number of simulated samples in each condition
simulated_datasets$num_samples

# the list of simulated protein abundance matrices
# Each element of the list represents one simulation
head(simulated_datasets$simulation_train_Xs[[1]]) # first simulation

# the list of simulated condition vectors
# Each element of the list represents one simulation
head(simulated_datasets$simulation_train_Ys[[1]]) # first simulation

## ----message = FALSE , warning = FALSE----------------------------------------
# expected_FC = expected_FC: user defined fold change
unique(OV_SRM_train_annotation$Condition)
expected_FC <- c(1, 1.5)
names(expected_FC) <- c("control", "ovarian cancer")
set.seed(1212)
# Here I randomly select some proteins as differential to show how the function works
# The user should prepare this list of differential proteins by themselves
diff_proteins <- sample(rownames(OV_SRM_train), 20)
simualted_datasets_predefined_FC <- simulateDataset(data = OV_SRM_train,
                                      annotation = OV_SRM_train_annotation,
                                      num_simulations = 10, # simulate 10 times
                                      expected_FC = expected_FC,
                                      list_diff_proteins =  diff_proteins,
                                      select_simulated_proteins = "proportion",
                                      protein_proportion = 1.0,
                                      protein_number = 1000,
                                      samples_per_group = 50, # 50 samples per condition
                                      simulate_validation = FALSE,
                                      valid_samples_per_group = 50)


## ----message = FALSE, warning = FALSE-----------------------------------------
classification_results <- designSampleSizeClassification(
    simulations = simulated_datasets, 
    parallel = FALSE)


## ----message = FALSE, warning = FALSE-----------------------------------------
# the number of simulated proteins
classification_results$num_proteins
# a vector with the number of simulated samples in each condition
classification_results$num_samples
# the mean predictive accuracy over all the simulated datasets,
# which have same 'num_proteins' and 'num_samples'
classification_results$mean_predictive_accuracy
# the mean protein importance vector over all the simulated datasets,
# the length of which is 'num_proteins'.
head(classification_results$mean_feature_importance)

## ----message = FALSE, warning = FALSE-----------------------------------------
## try parallel computation to speed up
## The parallel computation may cause error while allocating the core resource
## Then the users can try the abova function without parallel computation
classification_results_parallel <- designSampleSizeClassification(
    simulations = simulated_datasets, 
    parallel = TRUE)

## ---- eval=FALSE--------------------------------------------------------------
#  #### sample size classification ####
#  # simulate different sample sizes
#  # 1) 10 biological replicats per group
#  # 1) 25 biological replicats per group
#  # 2) 50 biological replicats per group
#  # 3) 100 biological replicats per group
#  # 4) 200 biological replicats per group
#  list_samples_per_group <- c(10, 25, 50, 100, 200)
#  
#  # save the simulation results under each sample size
#  multiple_sample_sizes <- list()
#  
#  for(i in seq_along(list_samples_per_group)){
#      # run simulation for each sample size
#      simulated_datasets <- simulateDataset(data = OV_SRM_train,
#                                        annotation = OV_SRM_train_annotation,
#                                        num_simulations = 10, # simulate 10 times
#                                        expected_FC = "data",
#                                        list_diff_proteins =  NULL,
#                                        select_simulated_proteins = "proportion",
#                                        protein_proportion = 1.0,
#                                        protein_number = 1000,
#                                        samples_per_group = list_samples_per_group[i],
#                                        simulate_valid = FALSE,
#                                        valid_samples_per_group = 50)
#  
#      # run classification performance estimation for each sample size
#      res <- designSampleSizeClassification(simulations = simualted_datasets,
#                                            parallel = TRUE)
#  
#      # save results
#      multiple_sample_sizes[[i]] <- res
#  }
#  
#  ## make the plots
#  designSampleSizeClassificationPlots(multiple_sample_sizes,
#                                      list_samples_per_group,
#                                      ylimUp_predictive_accuracy = 0.8,
#                                      ylimDown_predictive_accuracy = 0.6)

## -----------------------------------------------------------------------------
#  output a pdf file with sample size calculation plot for hypothesis testing
#  also return a table which summaries the plot
HT_res <- designSampleSizeHypothesisTestingPlot(data = OV_SRM_train,
                                                annotation= OV_SRM_train_annotation,
                                                desired_FC = "data",
                                                select_testing_proteins = "proportion",
                                                protein_proportion = 1.0,
                                                protein_number = 1000,
                                                FDR=0.05,
                                                power=0.9)

# data frame with columns desiredFC, numSample, FDR, power and CV
head(HT_res)

## ---- eval=FALSE--------------------------------------------------------------
#  #  output a pdf file with 11 PCA plots
#  designSampleSizePCAplot(simulated_datasets)