BioWorkflows: Multiomics Integration of Immune-Mediated Monogenic Diseases

library(Biobase)

# Set globals
## Baltimore aging cohort hybrid-normalized, calibration-normalized, and median-normalized RFUs
RFUS.IN.PATH = snakemake@input[[1]]#'Reference/ferrucci/raw/CHI-16-018_Hyb.Cal_RFU.txt.adat_RFU.txt'
## Baltimore aging cohort sample metadata
SAMPLES.META.IN.PATH = snakemake@input[[2]]#'Reference/ferrucci/raw/CHI-16-018_Hyb.Cal_RFU.txt.adat_Samples.txt'
## Baltimore aging cohort somamer metadata
SOMAMERS.META.IN.PATH = snakemake@input[[3]]#'Reference/ferrucci/raw/CHI-16-018_Hyb.Cal_RFU.txt.adat_Somamers.txt'

## Baltimore aging cohort eset
ESET.OUT.PATH = snakemake@output[[1]]#'Reference/ferrucci/processed/aging_eset.RDS'

# Load data
RFUs = read.table(RFUS.IN.PATH, sep = '\t', comment.char = '', header = FALSE)
samples.meta = read.table(SAMPLES.META.IN.PATH, sep = '\t', comment.char = '', header = TRUE)
nlines = length(readLines(SOMAMERS.META.IN.PATH)) # There are four lines of comments at the bottom of the file, so we only take the first n - 4 rows
somamers.meta = read.table(SOMAMERS.META.IN.PATH, sep = '\t', comment.char = '',
                           quote = '', header = FALSE, nrow = nlines - 4, row.names = 1)

# Put somamer metadata into a dataframe
somamers.meta = t(somamers.meta)
somamers.meta = as.data.frame(somamers.meta)

# Name the RFUs data frame columns using the somamer ID
# We use somamer IDs rather than the target name to ensure compatibility with the somalogic
# data in the monogenic cohort
colnames(RFUs) = somamers.meta$SomaId

# Name the RFUs data frame forws using the plate ids and positions
ids = paste(samples.meta$PlateId, samples.meta$PlatePosition)
ids = gsub(' ', '_', ids)
rownames(RFUs) = ids

# Convert the RFUs data frame to a matrix
RFUs = as.matrix(RFUs)

# Log transform the data
RFUs = log2(RFUs)

# Create the eset
rownames(somamers.meta) = somamers.meta$SomaId
rownames(samples.meta) = ids
eset = ExpressionSet(t(RFUs))
featureData(eset) = AnnotatedDataFrame(somamers.meta)
phenoData(eset) = AnnotatedDataFrame(samples.meta)

# Save the results
saveRDS(eset, ESET.OUT.PATH)

R Biobase From line 4 of preprocessing/process_ferrucci_data.R

AI.MODELS.PATH = snakemake@input[[1]] #'Classification/results/healthy_rf_models_AI.RDS'
## Random Forest sample meta data for all subjects
FULL.DESGIN.MATRICES.PATH = snakemake@input[[2]] #'Classification/design_matrices/healthy_random_forest_design_matrices_all.RDS'
## Random Forest sample meta data for all subjects
FULL.META.DATA.PATH = snakemake@input[[3]] #'Classification/design_matrices/healthy_random_forest_sample_meta_data_all.RDS'

## The predictive index of each subject
SCORES.OUT.PATH = snakemake@output[[1]] #'Classification/predictions/healthy_rf_PID_predictions_using_AI_index.RDS'

# Load libraries
library(randomForest)
library(Biobase)

# Set seed
set.seed(102409)

# Source utility functions
source('scripts/util/Groups/groups.R')

# Load data
models = readRDS(AI.MODELS.PATH)

full.design.matrices = readRDS(FULL.DESGIN.MATRICES.PATH)
full.meta.data = readRDS(FULL.META.DATA.PATH)

# For each model
scores = mapply(function(model, full.design.matrix) {
  # Make the design matrix for just the PID patients
  X = full.design.matrix[full.meta.data$condition %in% util.get_pid(), ]

  # Get the classifier's predictions
  predict(model, X, type = 'prob')[,'1']
}, models, full.design.matrices)

# Save results
saveRDS(as.data.frame(scores), SCORES.OUT.PATH)

R Biobase randomForest From line 5 of analysis/get_AI_based_HI_on_PID_subjects.R

PERMUTATIONS.IN.PATHS = snakemake@input
PVALS.OUT.PATH = snakemake@output[[1]]#'Classification/results/healthy_rf_pvals_all.RDS'

## Print confirmation that the desired input files are being used
print('Files used for permutation-based pvalue estimation:')
print(PERMUTATIONS.IN.PATHS)

## Load data
results = lapply(PERMUTATIONS.IN.PATHS, readRDS)

## Get the names of the classifiers
classifiers = names(results[[1]])

## For each classifier
p.valss = lapply(classifiers, function(classifier) {
  ## For each permutation test iteration
  p.vals = sapply(results, function(result) {
    ## Get that iteration's pvalues (for each feature)
    ## for that classifier
    result[[classifier]]
  })
  ## Get the average permutation pvalues across
  ## all iterations
  p.vals = rowMeans(p.vals)

  ## Here, we set a lower limit for the pvals, reflecting the fact that permutation tests
  ## have precision based on the number of tests run (#tests per iteration x # iterations).
  ## Note that the number of tests per iteration for a classifer equals the number of features
  ## in that classifier, based on how the permutation iterations were run.
  ## For each classifier
  n.tests = length(p.vals) * length(results)
  ## Get the permutation test precision
  precision = 1 / n.tests
  ## For each feature take the larger of the permutation-based pvalue and the precison
  pmax(p.vals, precision)
})

## Name the pvalues by classifier
names(p.valss) = classifiers

## Save results
saveRDS(p.valss, PVALS.OUT.PATH)

R From line 12 of analysis/get_feature_pvalues.R

PID.MODELS.PATH = snakemake@input[[1]] #'Classification/results/healthy_rf_models_PID.RDS'
## Random Forest design matrices for all subjects
FULL.DESGIN.MATRICES.PATH = snakemake@input[[2]] #'Classification/design_matrices/healthy_random_forest_design_matrices_all.RDS'
## Random Forest sample meta data for all subjects
FULL.META.DATA.PATH = snakemake@input[[3]] #'Classification/design_matrices/healthy_random_forest_sample_meta_data_all.RDS'

## The predictive index of each subject
SCORES.OUT.PATH = snakemake@output[[1]] #'Classification/predictions/healthy_rf_AI_predictions_using_PID_index.RDS'

# Load libraries
library(randomForest)
library(Biobase)

# Set seed
set.seed(102409)

# Source utility functions
source('scripts/util/Groups/groups.R')

# Load data
models = readRDS(PID.MODELS.PATH)

full.design.matrices = readRDS(FULL.DESGIN.MATRICES.PATH)
full.meta.data = readRDS(FULL.META.DATA.PATH)

# For each model
scores = mapply(function(model, full.design.matrix) {
  # Make the design matrix for just the AI patients
  X = full.design.matrix[full.meta.data$condition %in% util.get_ai(), ]

  # Get the classifier's predictions
  predict(model, X, type = 'prob')[,'1']
}, models, full.design.matrices)

# Save results
saveRDS(as.data.frame(scores), SCORES.OUT.PATH)

R Biobase randomForest From line 5 of analysis/get_PID_based_HI_on_AI_subjects.R

TRAINING.MODELS.PATH = snakemake@input[[1]] #'Classification/healthy_random_forest_design_matrices_all.RDS'
## Random Forest design matrices for testing subjects
TESTING.DESGIN.MATRICES.PATH = snakemake@input[[2]] #'Classification/healthy_random_forest_testing_design_matrices_all.RDS'

## The predictive index of each subject
SCORES.OUT.PATH = snakemake@output[[1]] #'Classification/predictions/healthy_rf_testing_predictions_all.RDS'

# Load libraries
library(randomForest)
library(Biobase)

# Set seed
set.seed(102409)

# Source utility functions
source('scripts/util/Groups/groups.R')

# Load data
models = readRDS(TRAINING.MODELS.PATH)
testing.design.matrices = readRDS(TESTING.DESGIN.MATRICES.PATH)

# For each model
scores = mapply(function(model, testing.design.matrix) {
  # Get the classifier's predictions
  predict(model, testing.design.matrix, type = 'prob')[,'1']
}, models, testing.design.matrices)

# Save results
saveRDS(as.data.frame(scores), SCORES.OUT.PATH)

R Biobase randomForest From line 10 of analysis/get_testing_set_HI.R

set.seed(99)

# Load packages and source utilities
library(randomForest)
source('scripts/util/Classification/randomForestClassifier.R')

# Set globals
DESIGN.MATRICES.IN.PATH = snakemake@input[[1]]#'Classification/healthy_random_forest_design_matrices_all.RDS'
META.DATA.IN.PATH = snakemake@input[[2]]#'Classification/healthy_random_forest_sample_meta_data_all.RDS'
GVIS.IN.PATH = snakemake@input[[3]]#'Classification/results/healthy_random_forest_rf_gvis_all.RDS'
CONDITION.GROUPS.IN.PATH = snakemake@input[[4]]#'Classification/condition_groups.RDS'
BACKGROUND.GROUPS.IN.PATH = snakemake@input[[5]]#'Classification/background_groups.RDS'

SNAKEMAKE.OUT.PATH = snakemake@output[[1]]

# We get the group from the output file name
out.file = basename(SNAKEMAKE.OUT.PATH)
out.file = gsub('.RDS$', '', out.file, ignore.case = T)
fields = strsplit(out.file,'_')[[1]]
condition.id = fields[1] # The condition group represents the 'positive' condition for the classifier (e.g. 'healthy', 'cgd', 'xcgd','47cgd','stat1.gof')
background.id = fields[length(fields)] # The background group represents the background pool of all other conditions to consider (e.g. 'PID','AI','all')
i = as.numeric(fields[3])

# We load the condition and background groups
condition.groups = readRDS(CONDITION.GROUPS.IN.PATH)
background.groups = readRDS(BACKGROUND.GROUPS.IN.PATH)

# Get the seed corresponding to each condition so we aren't using the same seeds for each condition
n = 100000
condition.seeds = lapply(condition.groups, function(condition.group){sample.int(n, size = 1)})
background.seeds = lapply(background.groups, function(background.group){sample.int(n, size = 1)})

condition.seed = condition.seeds[[condition.id]]
background.seed = background.seeds[[background.id]]

# Get the conditons to investigate corresponding to the 'condition' field.
condition = condition.groups[[condition.id]]

# Set seed based on that permutation number
set.seed(background.seed + condition.seed + i)

# We load the design matrices, associated meta data, and gvis
# from the random forest classifiers
Xs = readRDS(DESIGN.MATRICES.IN.PATH)
meta = readRDS(META.DATA.IN.PATH)
gvis = readRDS(GVIS.IN.PATH)

# For each design matrix and its associated RF GVIs
perm.pvals = mapply(function(X, gvi) {
  # Repeat for the following for the number of features
  # in the design matrix:
  perm.gvis = sapply(1:ncol(X), function(j) {
    # Permute the response vector
    y.perm = sample(meta$condition)
    # Train a RF model using the permuted
    # response vector, and return the associated gvis
    # for each feature
    get.gvis(X, y.perm, pos = condition)
  })

  # For each feature, get the permutation pvalue, which is the 
  # percent of times when the true gvi for that feature was 
  # lower than the permutation gvis for 
  # that feature in this
  # iteration of the permutation tests
  rowMeans(gvi <= perm.gvis)

}, Xs, gvis, SIMPLIFY = FALSE)

# Save results in the output file for
# this permutation number, as specified
# in the snakefile
saveRDS(perm.pvals, SNAKEMAKE.OUT.PATH)

# Use the permutation number to print 
# confirmation of finishing this permutation
print(i)

R randomForest From line 12 of analysis/run_gvi_permutations.R

set.seed(2020)

# Load packages and source utilities
library(randomForest)
source('scripts/util/Classification/randomForestClassifier.R')

# Set globals
DESIGN.MATRICES.IN.PATH = snakemake@input[[1]]#'Classification/healthy_random_forest_design_matrices_all.RDS'
META.DATA.IN.PATH = snakemake@input[[2]]#'Classification/healthy_random_forest_sample_meta_data_all.RDS'
CONDITION.GROUPS.IN.PATH = snakemake@input[[3]]#Classification/condition_groups.RDS
BACKGROUND.GROUPS.IN.PATH = snakemake@input[[4]]#Classification/background_groups.RDS
PREDICTIONS.OUT.PATH = snakemake@output[[1]]#'Classification/results/healthy_rf_results_all.RDS'
GVIS.OUT.PATH = snakemake@output[[2]]#'Classification/results/healthy_rf_gvis_all.RDS'
MODELS.OUT.PATH = snakemake@output[[3]]#'Classification/results/healthy_rf_models_all.RDS'

# We get the group from the output file name
out.file = basename(DESIGN.MATRICES.IN.PATH)
out.file = gsub('.RDS$', '', out.file, ignore.case = T)
fields = strsplit(out.file,'_')[[1]]
condition.id = fields[1] # The condition group represents the 'positive' condition for the classifier (e.g. 'healthy', 'cgd', 'xcgd','47cgd','stat1.gof')
background.id = fields[length(fields)] # The background group represents the background pool of all other conditions to consider (e.g. 'PID','AI','all')

# We load the condition and background groups
condition.groups = readRDS(CONDITION.GROUPS.IN.PATH)
background.groups = readRDS(BACKGROUND.GROUPS.IN.PATH)

# Get the seed corresponding to each condition so we aren't using the same seeds for each condition
n = 100000
condition.seeds = lapply(condition.groups, function(condition.group){sample.int(n, size = 1)})
background.seeds = lapply(background.groups, function(background.group){sample.int(n, size = 1)})

condition.seed = condition.seeds[[condition.id]]
background.seed = background.seeds[[background.id]]

# Get the conditions to serve as the 'positive' class
condition.group = condition.groups[[condition.id]]

# Set the first seed
set.seed(sample.int(n, size = 1) + condition.seed + background.seed)

# Load data
Xs = readRDS(DESIGN.MATRICES.IN.PATH)
meta = readRDS(META.DATA.IN.PATH)

# Run cross validations on each design matrix
predictions = sapply(Xs, function(X) {
  cross.validation(X, meta$condition, pos = condition.group)
})

# Name the columns and rows of the predictions matrix
colnames(predictions) = names(Xs)
rownames(predictions) = rownames(meta)

# Convert the results matrix to a data frame
predictions = as.data.frame(predictions)

# Reset seed
set.seed(sample.int(n, size = 1) + condition.seed + background.seed)

# Train the random forests with all models
models = lapply(Xs, function(X) {
  get.rf.model(X, meta$condition, pos = condition.group)
})

# Get the gvis from random forests with all samples
gvis = lapply(models, function(model) {
  model$importance[,'MeanDecreaseGini']
})

# Save the results
saveRDS(predictions, PREDICTIONS.OUT.PATH)
saveRDS(gvis, GVIS.OUT.PATH)
saveRDS(models, MODELS.OUT.PATH)

R randomForest From line 13 of analysis/run_random_forests.R

source('scripts/util/Groups/groups.R')

# Set globals
CONDITION.GROUPS.OUT.PATH = snakemake@output[[1]]#'Classification/condition_groups.RDS'
BACKGROUND.GROUPS.OUT.PATH = snakemake@output[[2]]#'Classification/background_groups.RDS'

# Create the list of condition groups (the 'positive' class for the random forest)
# The names correspond to ids for the condition groups (abbreviations used in the file names corresponding to the condition groups)
condition.groups = list(
  'cgd' = c('XCGD', '47CGD'),
  'xcgd' = 'XCGD',
  '47cgd' = '47CGD',
  'stat1' = 'STAT1 GOF',
  'job' = 'Job',
  'fmf' = 'FMF',
  'healthy' = 'Healthy'
)

# Create the list of background groups
# It is okay if the background group contains the condition group, they will be eliminated before creating
# the 'negative' class for the classifier
background.groups = list(
  'AI' = util.get_ai(),
  'PID' = util.get_pid(),
  'all' = c(util.get_pid(), util.get_ai(), util.get_tert_terc())
)

# We eliminate NEMO carriers from the background groups
background.groups = lapply(background.groups, function(background.group) {setdiff(background.group, 'NEMO carrier')})

# Save results
saveRDS(condition.groups, CONDITION.GROUPS.OUT.PATH)
saveRDS(background.groups, BACKGROUND.GROUPS.OUT.PATH)

R From line 6 of preprocessing/create_classifier_groups.R

source('scripts/util/Processing/averageRepeatSamples.R')
source('scripts/util/Groups/groups.R')
library(Biobase)

# We set the global paths
ESET.IN.PATHS = list(
  somalogic.features = snakemake@input[[1]], #"Data/Somalogic/analysis_output/stability/stable_somalogic_sample_level_features.rds"
  somalogic.modules = snakemake@input[[2]], #"Data/Somalogic/analysis_output/stability/stable_somalogic_sample_level_modules.rds"
  microarray.features = snakemake@input[[3]], #"Data/Microarray/analysis_output/stability/stable_microarray_sample_level_features.rds"
  microarray.modules = snakemake@input[[4]], #"Data/Microarray/analysis_output/stability/stable_microarray_sample_level_modules.rds"
  tbnks = snakemake@input[[5]] #"Data/TBNK/analysis_output/stability/stable_tbnk_sample_level_features.rds"
)

CONDITION.GROUPS.IN.PATH = snakemake@input[[6]] #"Classification/condition_groups.RDS"
BACKGROUND.GROUPS.IN.PATH = snakemake@input[[7]] #"Classification/background_groups.RDS"
SOMALOGIC.MODULES.IN.PATH = snakemake@input[[8]] #"Data/Somalogic/analysis_output/wgcna_results/modules.rds"

DESIGN.MATRIX.OUT.PATH = snakemake@output[[1]] #'Classification/healthy_all_design_matrices_all.RDS'
META.DATA.OUT.PATH = snakemake@output[[2]] #'Classification/healthy_random_forest_sample_meta_data_all.RDS'

# We get the group from the output file name
out.file = basename(DESIGN.MATRIX.OUT.PATH)
out.file = gsub('.RDS$', '', out.file, ignore.case = T)
fields = strsplit(out.file,'_')[[1]]
condition.id = fields[1] # The condition group represents the 'positive' condition for the classifier (e.g. 'healthy', 'cgd', 'xcgd','47cgd','stat1.gof')
background.id = fields[length(fields)] # The background group represents the background pool of all other conditions to consider (e.g. 'PID','AI','all')

# Get the conditons to investigate corresponding to the 'condition' field.
condition.groups = readRDS(CONDITION.GROUPS.IN.PATH)
condition.group = condition.groups[[condition.id]]

# Get the background groups to investiate corresponding to the 'background field'
background.groups = readRDS(BACKGROUND.GROUPS.IN.PATH)
background.group = background.groups[[background.id]]

# We load the relevant data
esets = lapply(ESET.IN.PATHS, readRDS)
somalogic.module.memberships = readRDS(SOMALOGIC.MODULES.IN.PATH)

# We make an eset with just the grey somalogic proteins
somalogic.grey.module = names(somalogic.module.memberships)[somalogic.module.memberships == 'grey']
somalogic.features.eset = esets[['somalogic.features']]
somalogic.grey.eset = somalogic.features.eset[rownames(somalogic.features.eset) %in% somalogic.grey.module, ]
esets[['somalogic.grey']] = somalogic.grey.eset

# We find the visit ids shared by all the data types
patient.ids = Reduce(intersect, lapply(esets, function(x) {x$patient_id}))

# For each eset we
esets = lapply(esets, function(eset) {
  # We subset the data to include only the relevant visits
  eset = eset[, eset$patient_id %in% patient.ids];
  # We average over visit ids from the same patient in the expression sets
  eset = averageRepeatSamples(eset, meta.cols = c('condition','race','gender'))
  # We subset to just the condition of interest and background group
  eset = eset[, eset$condition %in% c(condition.group, background.group)]
  # We rearrange each data set to have subjects in the same order
  eset = eset[, order(eset$patient_id)]
})

# We extract matrices from the esets
Xs = lapply(esets, function(eset) {
  # We create several design matrices with the different features we wish to investigate
  X = t(exprs(eset))
})

# We prefix all of the rownames of each matrix with the revelant data type
Xs = mapply(function(X, name) {colnames(X) = paste(name, colnames(X), sep = '.'); return(X)}, Xs, names(Xs))

# We create a multimodal set with all module scores and tbnks
Xs[['all.modules.with.tbnks']] = cbind(Xs[['microarray.modules']],
                                       Xs[['somalogic.modules']],
                                       Xs[['tbnks']])

# We create a multimodal set with all module scores, grey proteins, and tbnks
Xs[['all.modules.plus.grey.with.tbnks']] = cbind(Xs[['microarray.modules']],
                                                 Xs[['somalogic.modules']],
                                                 Xs[['somalogic.grey']],
                                                 Xs[['tbnks']])

# Here, we add a CBC matrix with just CBC parameters (no lymphocyte phenotyping)
# First, we extract the tbnks matrix
X.tbnks = Xs[['tbnks']]
# We get all the possible absolute and relative features from the lymphocyte populations
tbnk.specific = c('cd3', 'cd4_cd3', 'cd8_cd3', 'cd19', 'nk_cells')
tbnk.specific = c(paste0(tbnk.specific, '_abs'), paste0(tbnk.specific, '_percent'))
tbnk.specific = paste0('tbnks.', tbnk.specific)
# We remove these features from the full set of tbnks features to get cbc-specific features
cbc.specific = setdiff(colnames(X.tbnks), tbnk.specific)
# We subset the tbnks matrix to these features
X.cbcs = X.tbnks[, cbc.specific]
# We add the cbc-specific features matrix to the list of matrices
Xs[['cbcs']] = X.cbcs

# We extract the meta data for the samples
# We choose to use the age from the average of their TBNKs. The ages of a patient
# should be very similar across data types so an arbitrary decision is made.
meta = pData(esets$tbnks)

# We save the data
saveRDS(Xs, file = DESIGN.MATRIX.OUT.PATH)
saveRDS(meta, file = META.DATA.OUT.PATH)

R Biobase From line 12 of preprocessing/create_design_matrices.R

if(!exists("snakemake")){
  setwd("../../..")
  source("scripts/util/paper/parse_snakemake.R")
  parse_snakemake(rule = "design_mat_no_pm2")

}

IN.PATH <- snakemake@input[[1]]

OUT.PATH <- snakemake@output[[1]]

design_mat_list <- readRDS(IN.PATH)

#design_mat_list <- design_mat_list[c(4, 5, 6)]

#remove pm2 somalogic.modules.purple
for(nm in names(design_mat_list)){
  m <- design_mat_list[[nm]]
  m <- m[, setdiff(colnames(m), "somalogic.modules.purple")]
  design_mat_list[[nm]] <- m
}

saveRDS(design_mat_list, OUT.PATH)

R Snakemake From line 2 of preprocessing/design_mat_no_pm2.R

DESIGN.MATRICES.IN.PATH = snakemake@input[[1]]#'Classification/healthy_design_matrices_all.RDS'
META.DATA.IN.PATH = snakemake@input[[2]]#'Classification/healthy_random_forest_sample_meta_data_all.RDS'

DESIGN.MATRICES.OUT.PATH = snakemake@output[[1]]#'Classification/healthy_random_forest_design_matrices_all.RDS'

# Read in design matrices and meta data
Xs = readRDS(DESIGN.MATRICES.IN.PATH)
meta = readRDS(META.DATA.IN.PATH)

# Subset the design matrices to those we wish to use for the random forest
features = c('cbcs',
             'tbnks',
             'microarray.modules',
             'somalogic.modules',
             'all.modules.with.tbnks',
             'all.modules.plus.grey.with.tbnks')

Xs = Xs[features]

# Save design matrices and meta data
saveRDS(Xs, DESIGN.MATRICES.OUT.PATH)

R From line 9 of preprocessing/subset_design_matrices.R

set.seed(1909)

# Load libraries
library(limma)
library(Biobase)
library(dplyr)

# Source utilities
source('scripts/util/DifferentialExpression/limma.R')

# Set globals
## Path to esets used for fitting the model
ESET.IN.PATHS = list(
  somalogic.modules = snakemake@input[[1]],#'Data/Somalogic/analysis_output/wgcna_results/scores_sample_level.rds',
  somalogic.features = snakemake@input[[2]],#'Data/Somalogic/data_analysis_ready/analysis_ready_sample_level_training_somalogic.rds',
  microarray.modules = snakemake@input[[3]],#'Data/Microarray/analysis_output/WGCNA/array_sample_scores.rds',
  microarray.features = snakemake@input[[4]],#'Data/Microarray/data_analysis_ready/eset_batch_training_sample.rds',
  tbnks.features = snakemake@input[[5]]#'Data/TBNK/data_analysis_ready/tbnk_training_sample_level_eset.rds'
)

## Where to save the fitted limma models
FIT.OUT.PATHS = list(
  somalogic.modules = snakemake@output[[1]],#'Data/Somalogic/analysis_output/differential_expression/somalogic_modules_DE_fit.rds',
  somalogic.features = snakemake@output[[2]],#'Data/Somalogic/analysis_output/differential_expression/somalogic_features_DE_fit.rds',
  microarray.modules = snakemake@output[[3]],#'Data/Microarray/analysis_output/differential_expression/microarray_modules_DE_fit.rds',
  microarray.features = snakemake@output[[4]],#'Data/Microarray/analysis_output/differential_expression/microarray_features_DE_fit.rds',
  tbnks.features = snakemake@output[[5]]#'Data/TBNK/analysis_output/differential_expression/tbnks_features_DE_fit.rds'
)

# Load esets
esets = lapply(ESET.IN.PATHS, readRDS)

# Instantiate a function to get stats associated with stable features of an eset
get_fit = function(eset) {

  # Scale eset
  exprs(eset) = t(scale(t(exprs(eset))))

  # Get design matrix
  design = make_design(eset)

  # Fit limma model
  fit = fit_limma(eset, design)

  return(fit)
}

# Apply the function over all esets
fits = lapply(esets, get_fit)

# Save fits
mapply(function(fit, out.path) {
  saveRDS(fit, out.path)
}, fits, FIT.OUT.PATHS)

R dplyr Biobase limma From line 5 of analysis/get_DE_fits.R

set.seed(1909)

# Load libraries
library(limma)
library(Biobase)
library(dplyr)

# Source utilities
source('scripts/util/DifferentialExpression/limma.R')

# Set globals
## Path to esets used for fitting the model
ESET.IN.PATHS = list(
  somalogic.modules = snakemake@input[[1]],#'Data/Somalogic/analysis_output/wgcna_results/scores_sample_level.rds',
  somalogic.features = snakemake@input[[2]],#'Data/Somalogic/data_analysis_ready/analysis_ready_sample_level_training_somalogic.rds',
  microarray.modules = snakemake@input[[3]],#'Data/Microarray/analysis_output/WGCNA/array_sample_scores.rds',
  microarray.features = snakemake@input[[4]],#'Data/Microarray/data_analysis_ready/eset_batch_training_sample.rds',
  tbnks.features = snakemake@input[[5]]#'Data/TBNK/data_analysis_ready/tbnk_training_sample_level_eset.rds'
)

MEDS.IN.PATH = snakemake@input[[6]]#'Medications/medications.types.rds'

## Where to save the fitted limma models
FIT.OUT.PATHS = list(
  somalogic.modules = snakemake@output[[1]],#'Data/Somalogic/analysis_output/differential_expression/somalogic_modules_DE_fit.rds',
  somalogic.features = snakemake@output[[2]],#'Data/Somalogic/analysis_output/differential_expression/somalogic_features_DE_fit.rds',
  microarray.modules = snakemake@output[[3]],#'Data/Microarray/analysis_output/differential_expression/microarray_modules_DE_fit.rds',
  microarray.features = snakemake@output[[4]],#'Data/Microarray/analysis_output/differential_expression/microarray_features_DE_fit.rds',
  tbnks.features = snakemake@output[[5]]#'Data/TBNK/analysis_output/differential_expression/tbnks_features_DE_fit.rds'
)

# Load esets
esets = lapply(ESET.IN.PATHS, readRDS)

# Load medications
meds = readRDS(MEDS.IN.PATH)

# Remove patients who were on gamma at any time point
## Get the visits during which patients were on subject
gamma.subjects = unique(meds$patient_id[meds$IFN.gamma])

## Remove the subjects from all the esets
esets = lapply(esets, function(eset) { eset[, !eset$patient_id %in% gamma.subjects] })

# Instantiate a function to get stats associated with stable features of an eset
get_fit = function(eset) {

  # Scale eset
  exprs(eset) = t(scale(t(exprs(eset))))

  # Get design matrix
  design = make_design(eset)

  # Fit limma model
  fit = fit_limma(eset, design)

  return(fit)
}

# Apply the function over all esets
fits = lapply(esets, get_fit)

# Save fits
mapply(function(fit, out.path) {
  saveRDS(fit, out.path)
}, fits, FIT.OUT.PATHS)

R dplyr Biobase limma From line 6 of analysis/get_DE_fits_without_gamma_subjects.R

set.seed(1704)

# Load libraries and source utilities
library(limma)
library(Biobase)
library(dplyr)
source('scripts/util/DifferentialExpression/limma.R')

# Set globals
## Variance partitions for accessing stable features
VP.IN.PATHS = list(
  somalogic.modules = snakemake@input[[1]],#'Data/Somalogic/analysis_output/stability/somalogic_modules_standard_vp.rds',
  somalogic.features = snakemake@input[[2]],#'Data/Somalogic/analysis_output/stability/somalogic_features_standard_vp.rds',
  microarray.modules = snakemake@input[[3]],#'Data/Microarray/analysis_output/stability/microarray_modules_standard_vp.rds',
  microarray.features = snakemake@input[[4]],#'Data/Microarray/analysis_output/stability/microarray_features_standard_vp.rds',
  tbnks.features = snakemake@input[[5]]#'Data/TBNK/analysis_output/stability/tbnks_features_standard_vp.rds'
)

## DE model fits
FIT.IN.PATHS = list(
  somalogic.modules = snakemake@input[[6]],#'Data/Somalogic/analysis_output/differential_expression/somalogic_modules_DE_fit.rds',
  somalogic.features = snakemake@input[[7]],#'Data/Somalogic/analysis_output/differential_expression/somalogic_features_DE_fit.rds',
  microarray.modules = snakemake@input[[8]],#'Data/Microarray/analysis_output/differential_expression/microarray_modules_DE_fit.rds',
  microarray.features = snakemake@input[[9]],#'Data/Microarray/analysis_output/differential_expression/microarray_features_DE_fit.rds',
  tbnks.features = snakemake@input[[10]]#'Data/TBNK/analysis_output/differential_expression/tbnks_features_DE_fit.rds'
)

## Where to place the statistics derived in this script (just stable features)
RESULT.OUT.PATHS = list(
  somalogic.modules = snakemake@output[[1]],#'Data/Somalogic/analysis_output/differential_expression/somalogic_modules_DE_results.rds',
  somalogic.features = snakemake@output[[2]],#'Data/Somalogic/analysis_output/differential_expression/somalogic_features_DE_results.rds',
  microarray.modules = snakemake@output[[3]],#'Data/Microarray/analysis_output/differential_expression/microarray_modules_DE_results.rds',
  microarray.features = snakemake@output[[4]],#'Data/Microarray/analysis_output/differential_expression/microarray_features_DE_results.rds',
  tbnks.features = snakemake@output[[5]]#'Data/TBNK/analysis_output/differential_expression/tbnks_features_DE_results.rds'
)

# Where to place the intermediate results (with all features)
INTERMEDIATE.OUT.PATHS = list(
  somalogic.modules = snakemake@output[[6]],#'Data/Somalogic/analysis_output/differential_expression/somalogic_modules_DE_intermediates.rds',
  somalogic.features = snakemake@output[[7]],#'Data/Somalogic/analysis_output/differential_expression/somalogic_features_DE_intermediates.rds',
  microarray.modules = snakemake@output[[8]],#'Data/Microarray/analysis_output/differential_expression/microarray_modules_DE_intermediates.rds',
  microarray.features = snakemake@output[[9]],#'Data/Microarray/analysis_output/differential_expression/microarray_features_DE_intermediates.rds',
  tbnks.features = snakemake@output[[10]]#'Data/TBNK/analysis_output/differential_expression/tbnks_features_DE_intermediates.rds'
)

# Load fits
fits = lapply(FIT.IN.PATHS, readRDS)

# Load vps
vps = lapply(VP.IN.PATHS, readRDS)

# Get list of condition groups we wish to analyze (note that the *CONDITIONS variables are globals
# from the utility script)
groups = c(make_condition_groups(conditions = CONDITIONS),
           make_AI_PID_groups(ai.conditions = AI_CONDITIONS,
                              pid.conditions = PID_CONDITIONS,
                              tert.terc.conditions = TERT_TERC_CONDITIONS))

# Instantiate a function to get stats associated with stable features of an eset
run_stats = function(fit, vp) {

  # For the versus-healthy and versus-all analysis
  stats = lapply(c(FALSE, TRUE), function(cross) {

    # Design contrast matrix using the specified groups
    contrast = make_contrasts_mat(fit, groups, cross)

    # Fit the contrast matrix
    contrast.fit = contrasts.fit(fit, contrast)

    # Use ebayes or traditional t-test based on number of features
    if(nrow(vp) < 100) {
      stats = get_traditional_stats(contrast.fit)
    } else {
      stats = get_ebayes_stats(contrast.fit)
    }

    # Add an adjusted pvalue using FDR
    stats$adj.P.Val = apply(stats$P.Value, 2, function(x) {
      p.adjust(x, 'fdr')
    })

    return(stats)

  })

  names(stats) = c('versus.healthy', 'versus.all')
  return(stats)
}

# Instantiate a function to subset statistics from a data type to just the stable features
subset_stats = function(statss, vp) {

  # Get stable feautres
  stable.features = vp@row.names[vp$Patient >= .5]

  # For each of the options (versus healthy and versus all)
  statss.stable = lapply(statss, function(stats) {

    # For each statistic
    stats.stable = lapply(stats, function(stat) {

      # Subset the statistic to the stable features
      stat[stable.features, ]

    })

    # Add an adjusted pvalue, only among the stable features
    stats.stable$stable.feature.adj.P.Val = apply(stats.stable$P.Value, 2, function(x) {

      p.adjust(x, 'fdr')

    })


    return(stats.stable)

  })
}

# Apply the stat-computing function over all esets and vps
results = mapply(run_stats, fits, vps, SIMPLIFY = FALSE)

# Apply the stability filtering function over all esets and vps
results.stable = mapply(subset_stats, results, vps, SIMPLIFY = FALSE)

# Save intermediates
mapply(function(result, out.path) {
  saveRDS(result, out.path)
}, results, INTERMEDIATE.OUT.PATHS)

# Save stats
mapply(function(result, out.path) {
  saveRDS(result, out.path)
}, results.stable, RESULT.OUT.PATHS)

R dplyr Biobase From line 5 of analysis/get_DE_stats.R

library(limma)
library(Biobase)
library(dplyr)
source('scripts/util/DifferentialExpression/limma.R')

# Set seed		
set.seed(19020)

# Set globals
## Path to esets used for fitting the model
ESET.IN.PATHS = list(
  somalogic.modules = snakemake@input[[1]],#'Data/Somalogic/analysis_output/wgcna_results/scores_sample_level.rds',
  somalogic.features = snakemake@input[[2]],#'Data/Somalogic/data_analysis_ready/analysis_ready_sample_level_training_somalogic.rds',
  microarray.modules = snakemake@input[[3]],#'Data/Microarray/analysis_output/WGCNA/array_sample_scores.rds',
  microarray.features = snakemake@input[[4]],#'Data/Microarray/data_analysis_ready/eset_batch_training_sample.rds',
  tbnks.features = snakemake@input[[5]]#'Data/TBNK/data_analysis_ready/tbnk_training_sample_level_eset.rds'
)

## Where to save the fitted sex-linked DE 
limma models
FIT.OUT.PATHS = list(
  somalogic.modules = snakemake@output[[1]],#'Data/Somalogic/analysis_output/sex_related_de_signatures/somalogic_modules_DE_sex_linked_fit.rds',
  somalogic.features = snakemake@output[[2]],#'Data/Somalogic/analysis_output/sex_related_de_signatures/somalogic_features_DE_sex_linked_fit.rds',
  microarray.modules = snakemake@output[[3]],#'Data/Microarray/analysis_output/sex_related_de_signatures/microarray_modules_DE_sex_linked_fit.rds',
  microarray.features = snakemake@output[[4]],#'Data/Microarray/analysis_output/sex_related_de_signatures/microarray_features_DE_sex_linked_fit.rds',
  tbnks.features = snakemake@output[[5]]#'Data/TBNK/analysis_output/sex_related_de_signatures/tbnks_features_DE_sex_linked_fit.rds'
)

# Load esets
esets = lapply(ESET.IN.PATHS, readRDS)

# Instantiate a function to get stats associated with stable features of an eset
get_fit = function(eset, condition) {

  eset = eset[, eset$condition %in% c(condition, 'Healthy')]

  # Scale eset
  exprs(eset) = t(scale(t(exprs(eset))))

  # Get design matrix
  design = make_sex_linked_design(eset)

  # Fit limma model
  fit = fit_limma(eset, design)

  return(fit)
}

get_fits = function(eset) {
  conditions = c('STAT1 GOF', '47CGD', 'Job')
  fits = lapply(conditions, get_fit, eset = eset)
  names(fits) = conditions
  return(fits)
}

# Apply the function over all esets
fits = lapply(esets, get_fits)

# Save fits
mapply(function(fit, out.path) {
  saveRDS(fit, out.path)
}, fits, FIT.OUT.PATHS)

R dplyr Biobase limma From line 5 of analysis/get_sex_based_DE_fits.R

library(limma)
library(Biobase)
library(dplyr)
source('scripts/util/DifferentialExpression/sex_related_DE.R')

# Set seed		
set.seed(1709)

# Set globals
## Variance partitions for accessing stable features
VP.IN.PATHS = list(
  somalogic.modules = snakemake@input[[1]],#'Data/Somalogic/analysis_output/stability/somalogic_modules_standard_vp.rds',
  somalogic.features = snakemake@input[[2]],#'Data/Somalogic/analysis_output/stability/somalogic_features_standard_vp.rds',
  microarray.modules = snakemake@input[[3]],#'Data/Microarray/analysis_output/stability/microarray_modules_standard_vp.rds',
  microarray.features = snakemake@input[[4]],#'Data/Microarray/analysis_output/stability/microarray_features_standard_vp.rds',
  tbnks.features = snakemake@input[[5]]#'Data/TBNK/analysis_output/stability/tbnks_features_standard_vp.rds'
)

## Sex-linked DE model fits
FIT.IN.PATHS = list(
  somalogic.modules = snakemake@input[[6]],#'Data/Somalogic/analysis_output/sex_related_de_signatures/somalogic_modules_DE_sex_linked_fit.rds',
  somalogic.features = snakemake@input[[7]],#'Data/Somalogic/analysis_output/sex_related_de_signatures/somalogic_features_DE_sex_linked_fit.rds',
  microarray.modules = snakemake@input[[8]],#'Data/Microarray/analysis_output/sex_related_de_signatures/microarray_modules_DE_sex_linked_fit.rds',
  microarray.features = snakemake@input[[9]],#'Data/Microarray/analysis_output/sex_related_de_signatures/microarray_features_DE_sex_linked_fit.rds',
  tbnks.features = snakemake@input[[10]]#'Data/TBNK/analysis_output/sex_related_de_signatures/tbnks_features_DE_sex_linked_fit.rds'
)

## Where to place the statistics derived in this script
RESULT.OUT.PATHS = list(
  somalogic.modules = snakemake@output[[1]],#'Data/Somalogic/analysis_output/sex_related_de_signatures/somalogic_modules_DE_results.rds',
  somalogic.features = snakemake@output[[2]],#'Data/Somalogic/analysis_output/sex_related_de_signatures/somalogic_features_DE_results.rds',
  microarray.modules = snakemake@output[[3]],#'Data/Microarray/analysis_output/sex_related_de_signatures/microarray_modules_DE_results.rds',
  microarray.features = snakemake@output[[4]],#'Data/Microarray/analysis_output/sex_related_de_signatures/microarray_features_DE_results.rds',
  tbnks.features = snakemake@output[[5]]#'Data/TBNK/analysis_output/sex_related_de_signatures/tbnks_features_DE_results.rds'
)

## Where to place the intermediate results (with all features)
INTERMEDIATE.OUT.PATHS = list(
  somalogic.modules = snakemake@output[[6]],#'Data/Somalogic/analysis_output/sex_related_de_signatures/somalogic_modules_DE_intermediates.rds',
  somalogic.features = snakemake@output[[7]],#'Data/Somalogic/analysis_output/sex_related_de_signatures/somalogic_features_DE_intermediates.rds',
  microarray.modules = snakemake@output[[8]],#'Data/Microarray/analysis_output/sex_related_de_signatures/microarray_modules_DE_intermediates.rds',
  microarray.features = snakemake@output[[9]],#'Data/Microarray/analysis_output/sex_related_de_signatures/microarray_features_DE_intermediates.rds',
  tbnks.features = snakemake@output[[10]]#'Data/TBNK/analysis_output/sex_related_de_signatures/tbnks_features_DE_intermediates.rds'
)

# Load fits
fits = lapply(FIT.IN.PATHS, readRDS)

# Load vps
vps = lapply(VP.IN.PATHS, readRDS)

# Get list of condition groups we wish to analyze (note that the *CONDITIONS variables are globals
# from the utility script)

# Instantiate a function to get stats associated with stable features of an eset
run_stats = function(fits, vp) {

  stats = lapply(fits, function(fit) {

    # Use ebayes or traditional t-test based on number of features
    if(nrow(vp) < 100) {
      stats = get_traditional_stats(fit)
    } else {
      stats = get_ebayes_stats(fit)
    }

    # Add an adjusted pvalue using FDR
    stats$adj.P.Val = apply(stats$P.Value, 2, function(x) {
      p.adjust(x, 'fdr')
      return(x)
    })

    return(stats)

  })

  names(stats) = names(fits)
  return(stats)
}

# Instantiate a function to subset statistics from a data type to just the stable features
subset_stats = function(statss, vp) {

  # Get stable feautres
  stable.features = vp@row.names[vp$Patient >= .5]

  # For each condition
  stats.stable = lapply(statss, function(stats) {

    # For each statistic
    stats.stable = lapply(stats, function(stat) {

      # Subset the statistic to the stable features
      stat[stable.features, ]

    })

    # Add an adjusted pvalue, only among the stable features
    stats.stable$adj.P.Val = apply(stats.stable$P.Value, 2, function(x) {

      p.adjust(x, 'fdr')
      return(x)

    })

    return(stats.stable)
  })

  return(stats.stable)
}

# Apply the stat-computing function over all esets and vps
results = mapply(run_stats, fits, vps, SIMPLIFY = FALSE)

# Apply the stability filtering function over all esets and vps
results.stable = mapply(subset_stats, results, vps, SIMPLIFY = FALSE)

# Save intermediates
mapply(function(result, out.path) {
  saveRDS(result, out.path)
}, results, INTERMEDIATE.OUT.PATHS)

# Save stats
mapply(function(result, out.path) {
  saveRDS(result, out.path)
}, results.stable, RESULT.OUT.PATHS)

R dplyr Biobase From line 6 of analysis/get_sex_based_DE_stats.R

library(limma)

#setwd("/hpcdata/sg/sg_data/PROJECTS/Monogenic_Project")
#Source Scripts ----------------------------------------------------------
source("scripts_nick/util/Enrichment/camera.R")
#Set paths ---------------------------------------------------------------
#Inputs
FIT.IN.PATH <- "Data/Microarray/analysis_output/differential_expression/microarray_features_DE_fit.rds"
COMBINED.GENESETS.IN.PATH <- "Gene_sets/processed/combined_gene_sets.RDS"

#Out path
ENRICHMENT.DAT.OUT.PATH <- "Data/Microarray/analysis_output/differential_expression/enrichment/cameraPR_enrichment_list.rds" 
dir.create(dirname(ENRICHMENT.DAT.OUT.PATH), recursive = T)

#Load data ---------------------------------------------------------------
fit <- readRDS(FIT.IN.PATH)
genesetLL <- readRDS(COMBINED.GENESETS.IN.PATH)

#remove that t-statistics for genes without name
fit <- eBayes(fit)
tstat.dat <- fit$t
tstat.dat <- tstat.dat[!is.na(rownames(tstat.dat)), ]

#concatenate geneset list into single list
names(genesetLL$reactome) <- paste0("reactome_", names(genesetLL$reactome))
names(genesetLL$btms) <- paste0("btm_", names(genesetLL$btms))

geneset.list <- Reduce(c, genesetLL)

IFN.I.Dcact <- c("ATF3", "CCL8", "CXCL10", "DDX58", "DDX60", "DHX58", "EIF2AK2", 
                 "HERC5", "IFI27", "IFIH1", "IFIT1", "IFIT2", "IFIT3", "IRF7",
                 "LAMP3", "MX2", "OAS1", "OAS3", "OASL", "PARP9", "PLSCR1", "PML", 
                 "RSAD2", "SERPING1", "SP100", "TAP1")
geneset.list <- c(geneset.list, list(baseline_IFN.I.Dcact = IFN.I.Dcact))

#Run enrichment -----------------------------------------------------------
universe <- rownames(tstat.dat)
indices <- make_indices(geneset.list, universe, 5)

# keep only the condition ceofficients
tstat.dat <- tstat.dat[, grep("group", colnames(tstat.dat))]
colnames(tstat.dat) <- gsub("group", "", colnames(tstat.dat))

enrich.dat.list <- lapply(colnames(tstat.dat), function(col.name){
  tstat <- tstat.dat[, col.name]

  enrich.dat <- cameraPR(tstat, indices, use.ranks = TRUE, sort = FALSE)
  enrich.dat$geneset <- names(indices)
  geneset.db <- sapply(strsplit(enrich.dat$geneset, "_"), `[[`, 1)
  enrich.dat$geneset.db <- geneset.db

  return(enrich.dat)
})
names(enrich.dat.list) <- colnames(tstat.dat)

#Save ----------------------------------------------------------------------
saveRDS(enrich.dat.list, ENRICHMENT.DAT.OUT.PATH)

R From line 1 of feature_geneset_enrichments/run_enrich_camera.R

source('scripts/util/Enrichment/hyperGeo.R')

# Set globals
## Microarray module memberships path
MODULES.IN.PATH = snakemake@input[[1]]#'Data/Microarray/analysis_output/WGCNA/modules.rds'
## List of gene sets paths
GENE.SETS.IN.PATH = snakemake@input[[2]]#'Gene_sets/processed/combined_gene_sets.RDS'

## Place to save microarray module enrichments
ENRICHMENTS.OUT.PATH = snakemake@output[[1]]#'Data/Microarray/analysis_output/enrichments/microarray_module_gene_set_enrichments.RDS'

# Load data
modules = readRDS(MODULES.IN.PATH)
gene.sets = readRDS(GENE.SETS.IN.PATH)

# Get set of all genes
universe = names(modules)

# Get all module colors
module_colors = unique(modules)

# Apply enrichment function to all modules
## For each module
enrichments = lapply(module_colors, function(module) {
  ## Make the hits the set of genes in the module
  hits = names(modules)[modules == module]
  ## Test to see if any gene set is enriched for these hits
  multiHyperGeoTests(gene.sets, universe, hits, minGeneSetSize = 5, pAdjustMethod = "BH")
})

# Name the enrichments based upon their corresponding module
names(enrichments) = module_colors

# Save results
saveRDS(enrichments, ENRICHMENTS.OUT.PATH)

R From line 4 of analysis/microarray_modules_gene_set_enrichments.R

library(Biobase)

# Source utilities
source('scripts/util/Enrichment/hyperGeo.R')
source('scripts/util/Enrichment/proteinToGeneConversion.R')

# Set globals
## Somalogic module memberships
MODULES.IN.PATH = snakemake@input[[1]]#'Data/Somalogic/analysis_output/wgcna_results/modules.rds'
## Somalogic feature level eset, from which to get the fData for the somamers
ESET.IN.PATH = snakemake@input[[2]]#'Data/Somalogic/data_analysis_ready/analysis_ready_subject_level_training_somalogic.rds'
## The gene sets we wish to investigate for enrichments
GENE.SETS.IN.PATH = snakemake@input[[3]]#'Gene_sets/processed/combined_gene_sets.RDS'
## The tissue specific gene sets we wish to investigate for tissue enrichments
TISSUE.SETS.IN.PATH = snakemake@input[[4]]#'Gene_sets/processed/tissue_gene_sets.RDS'

## The location in which we wish to save the gene set enrichments for the somalogic
GENE.SETS.ENRICHMENTS.OUT.PATH = snakemake@output[[1]]#'Data/Somalogic/analysis_output/enrichments/somalogic_module_gene_set_enrichments.RDS'
## The location in which we wish to save the tissue enrichments for the somalogic
TISSUE.SETS.ENRICHMENTS.OUT.PATH = snakemake@output[[2]]#'Data/Somalogic/analysis_output/enrichments/somalogic_module_tissue_set_enrichments.RDS'

# Load data
modules = readRDS(MODULES.IN.PATH)
eset = readRDS(ESET.IN.PATH)
general.sets = readRDS(GENE.SETS.IN.PATH)
tissue.sets = readRDS(TISSUE.SETS.IN.PATH)

# Make a list combining tissue and gene sets for easy vectorization
gene.set.list = list(general = general.sets, tissue = tissue.sets)

# Get a map converting protein names to corresponding gene names based on the fData in somalogic
protein.to.gene.map = make_protein_to_gene_map(eset)

# Subset module to proteins in the map (i.e. those that correspond to only one gene)
modules = modules[names(modules) %in% names(protein.to.gene.map)]

# Get all the module colors
module_colors = unique(modules)

# Get the enrichments
## For both tissue gene sets and the normal gene sets
enrichments = lapply(gene.set.list, function(gene.sets) {
  ## For each module
  enrichments = lapply(module_colors, function(module_color){
    ## Get the proteins in the module
    module = names(modules)[modules == module_color]
    ## Get each gene for which all its corresponding proteins fall into or out of the module
    universe = get_coherent_genes(protein.to.gene.map, module)
    ## Extract the genes for which all corresponding proteins fall into the module
    hits = intersect(universe, protein.to.gene.map[module])
    ## Run gene set tests on the 'coherent' genes found above
    gene.sets.enrichments = multiHyperGeoTests(gene.sets, universe, hits, minGeneSetSize = 5, pAdjustMethod = "BH")
  })
  ## Name the enrichments based on the module
  names(enrichments) = module_colors
  ## Return result
  return(enrichments)
})

# Split enrichments into the general gene sets and the tissue gene sets and save each
saveRDS(enrichments$general, GENE.SETS.ENRICHMENTS.OUT.PATH)
saveRDS(enrichments$tissue, TISSUE.SETS.ENRICHMENTS.OUT.PATH)

R Biobase From line 9 of analysis/somalogic_modules_gene_set_enrichments.R

library(Biobase)
source('scripts/util/Enrichment/hyperGeo.R')

# Set globals
## The gene surrogate signatures for features of interest
SIGNATURES.IN.PATH = snakemake@input[[1]]#'Classification/transcriptional_surrogates/surrogate_signatures.RDS'
## The microarray subject-level training eset
ESET.IN.PATH = snakemake@input[[2]]#'Data/Microarray/data_analysis_ready/eset_batch_training_subject.rds'
## The gene sets
GENE.SETS.IN.PATH = snakemake@input[[3]]#'Gene_sets/processed/combined_gene_sets.RDS'

## The gene surrogate signature enrichments among the gene sets
ENRICHMENTS.OUT.PATH = snakemake@output[[1]]#'Classification/transcriptional_surrogates/surrogate_enrichments.RDS'

# Load data
signatures = readRDS(SIGNATURES.IN.PATH)
microarray = readRDS(ESET.IN.PATH)
gene.sets = readRDS(GENE.SETS.IN.PATH)

# Get gene universe
universe = rownames(microarray)

# We get enrichments from all genes
results = lapply(signatures, function(signature) {
  result = lapply(signature, function(half.signature) {
    print(length(universe))
    multiHyperGeoTests(gene.sets, universe, half.signature, minGeneSetSize = 5)
  })
  names(result) = names(signature)
  return(result)
})

names(results) = names(signatures)

saveRDS(results, ENRICHMENTS.OUT.PATH)

R Biobase From line 4 of analysis/transcriptional_signature_enrichments.R

source('scripts/util/Enrichment/gmtEnrichment.R')

# Set the paths
## For the GMTs to read in
GMT.IN.PATHS = list(
  reactome = snakemake@input[[1]],#'Gene_sets/raw/GMTs/reactome.gmt',
  go.bp = snakemake@input[[2]],#'Gene_sets/raw/GMTs/c5.bp.v6.2.symbols.gmt.txt',
  kegg = snakemake@input[[3]]#'Gene_sets/raw/GMTs/c2.cp.kegg.v6.2.symbols.gmt.txt'
)

## For the file with BTM information
BTMS.IN.PATH = snakemake@input[[4]]#'Gene_sets/raw/BTMs/btm_annotation_table.txt'

## For where to save the gene sets
GENE.SET.OUT.PATHS = list(
  reactome = snakemake@output[[1]],#'Gene_sets/processed/reactome.RDS',
  go.bp = snakemake@output[[2]],#'Gene_sets/processed/go.bp.RDS',
  kegg = snakemake@output[[3]],#'Gene_sets/processed/kegg.RDS',
  btms = snakemake@output[[4]]#'Gene_sets/processed/btm.rds'
) 

## For where to save a list with all the gene sets
GENE.SETS.OUT.PATH = snakemake@output[[5]]#'Gene_sets/processed/combined_gene_sets.RDS'

# Read in the GMTs
gene.sets = lapply(GMT.IN.PATHS, function(in.path) {
  gene.set = read.gmt(in.path)
})

# Get the btms

## Read in data
btm.dat <- read.table(BTMS.IN.PATH, header = TRUE, sep = '\t', comment.char = '', stringsAsFactors = FALSE)

## Save as list and give proper name
btms <- as.list(btm.dat$Module.member.genes)
btm.names <- paste(btm.dat$ID, btm.dat$Module.title, sep = "_")
names(btms) <- btm.names

## Remove commas and save as character vector where each gene is a component of the vector
btms <- lapply(btms, function(x) unlist(strsplit(x, split = ",")))

# Add the btms to the list of gene.sets
gene.sets[['btms']] = btms

# Save the gene sets separately
mapply(function(gene.set, out.path) {saveRDS(gene.set, out.path)}, gene.sets, GENE.SET.OUT.PATHS)

# Save the list of all gene sets
saveRDS(gene.sets, GENE.SETS.OUT.PATH)

R From line 4 of preprocessing/make_gene_sets.R

ATLAS.IN.PATH = snakemake@input[[1]]#'Gene_sets/raw/Human_protein_atlas/proteinatlas.tsv'
## Gene sets for the tissues
GENE.SET.OUT.PATH = snakemake@output[[1]]#'Gene_sets/processed/tissue_gene_sets.RDS'

# Load data frame from protein atlas
df = read.table(ATLAS.IN.PATH, header = TRUE, sep = '\t', comment.char = '', quote = '', stringsAsFactors = FALSE)

# Instantiate set making function for a given enrichment level
make_set = function(enrichment.levels) {

  ## Subset to only enrichments of the desired enrichment.levels
  df.specific = df[df$RNA.tissue.category %in% enrichment.levels, ]

  ## Get the set of all tissues in the HPA
  tissues = df.specific$RNA.TS.TPM
  ## Separate the tissues that may be enriched
  tissues = strsplit(tissues, '\\;')
  ## Parse these tissues to remove associated numerical values
  tissues = sapply(tissues, function(tissue) {
    tissue = sapply(strsplit(tissue, '\\:'), function(x) {x[[1]]})
  })
  ## Only take the unique tissues
  tissues = unique(unlist(tissues))

  ## Get the genes associated with each tissue
  gene_sets = lapply(tissues, function(tissue) {
    tissue = paste0(tissue, ':')
    ### Get the set of all genes that are specific to a tissue
    genes = df.specific$Gene[grepl(tissue, df.specific$RNA.TS.TPM)]
    ### Get the unique set of all these genes
    genes = unique(genes)
  })

  names(gene_sets) = tissues

  return(gene_sets)
}

# Set the enrichment levels we are interested in making gene sets for
levels = list(
  strict = c("Tissue enriched"),
  medium = c("Tissue enriched", "Tissue enhanced"),
  general = c("Tissue enhanced", "Tissue enriched", "Group enriched")
)

# Get the tissue enrichments for each set of enrichment levels
tissue_sets = lapply(levels, make_set)

# Save results
saveRDS(tissue_sets, GENE.SET.OUT.PATH)

R From line 5 of preprocessing/process_protein_atlas.R

suppressPackageStartupMessages({
  library(tidyverse)
  library(limma)
  library(parallel)
  library(Biobase)
  library(BiocGenerics)
})

#Source Scripts ----------------------------------------------------------
#source("scripts/util/Processing/averageRepeatSamples.R")
source("scripts/util/Enrichment/camera.R")
source("scripts/util/JIVE/prcomp_list_varfilter.R")

#Set paths ---------------------------------------------------------------
#Jive
JIVE.PATH <- snakemake@input[["jive"]]#"Integration_output/jive/subject/jive.rds"
JIVE.PC.PATH <- snakemake@input[["jive_pcs"]]#"Integration_output/jive/subject/prcomp_list.rds"

#Data
ARRAY.ESET.PATH <- snakemake@input[["array_eset"]]#"Data/Microarray/data_analysis_ready/eset_batch_training_subject.rds"
SOMA.ESET.PATH <- snakemake@input[["soma_eset"]]#"Data/Somalogic/data_analysis_ready/analysis_ready_sample_level_training_somalogic.rds"

#Geneset paths
TISSUE.IN.PATH <- snakemake@input[["tissue_genesets"]]#"Gene_sets/processed/tissue_gene_sets.RDS"
COMBINED.GENESETS.IN.PATH <- snakemake@input[["genesets"]]#"Gene_sets/processed/combined_gene_sets.RDS"

#Out path
ENRICHMENT.DAT.OUT.PATH <- snakemake@output[[1]]#"Integration_output/jive/subject/pc_enrich_dat_camera.rds"

#Load Jive ---------------------------------------------------------------
#need for the correlation with the array and soma data
jive <- readRDS(JIVE.PATH)

#Load Jive pcs------------------------------------------------------------
prcomp.list <- readRDS(JIVE.PC.PATH)
pdat <- prcomp.list$pdat

#remove unneccessary PC's ----------------------------------------------
prcomp.list <- prcomp_list_varfilter(prcomp.list, .03)
pca.list <- map(prcomp.list, "x") # get the PC's

#load data that has not been filtered for stable features ----------------
array.eset <- readRDS(ARRAY.ESET.PATH)
soma.eset <- readRDS(SOMA.ESET.PATH)

#Load Gene sets ----------------------------------------------------------
genesetLL <- readRDS(COMBINED.GENESETS.IN.PATH)
tissue <- readRDS(TISSUE.IN.PATH)

#create geneset list of lists --------------------------------------------
genesetLL <- c(genesetLL, list(tissue = tissue$general))

#Change the somalogic names so that they are genes not proteins ----------
soma.fdata <- featureData(soma.eset)@data
soma.data <- t(jive$data$soma)
array.data <- t(jive$data$array)
colnames(soma.data) <- 
  soma.fdata$EntrezGeneSymbol[match(colnames(soma.data), soma.fdata$Target)]
in.data.list <- list(array = array.data, soma = soma.data)

#Run enrichment -----------------------------------------------------------
enrich.dat <- lapply(pca.list, function(pca){
  lapply(genesetLL, function(geneset.list){
    lapply(in.data.list, function(in.data){
      cormat <- get_cormat(pca, in.data)

      cameraPR_cor(cormat, geneset.list, 
                   min.geneset.size = 3, use.ranks = FALSE, 
                   abs.cor = F) %>% 
        bind_rows(.id = "PC")
    }) %>% bind_rows(.id = "in.data")
  }) %>% bind_rows(.id = "geneset.db")
}) %>% bind_rows(.id = "pca.data")

#Save ----------------------------------------------------------------------
saveRDS(enrich.dat, ENRICHMENT.DAT.OUT.PATH)

R tidyverse Biobase BiocGenerics From line 7 of enrich/jive_pca_enrich_wdirection.R

library(r.jive)
library(dplyr)
library(Biobase)
library(BiocGenerics)

source("scripts/util/Processing/get_intersecting_data.R")
source("scripts/util/JIVE/JIVE_wrapper.R")

id.col <- "patient_id"

# set paths --------------------------------------------------------- 
#input
ARRAY.ESET.PATH <- snakemake@input[["array_in"]]#"Data/Microarray/analysis_output/stability/stable_microarray_subject_level_features.rds"
SOMA.ESET.PATH <- snakemake@input[["soma_in"]]#"Data/Somalogic/analysis_output/stability/stable_somalogic_subject_level_features.rds"

#output
JIVE.OUT.PATH <- snakemake@output[[1]]#"Integration_output/jive/subject_noHealthy/jive.rds"

if(!dir.exists(dirname(JIVE.OUT.PATH))){
  dir.create(dirname(JIVE.OUT.PATH))
}

# load data ---------------------------------------------------------
array.eset <- readRDS(ARRAY.ESET.PATH)
soma.eset <- readRDS(SOMA.ESET.PATH)

#Subset, remove healthies
array.eset <- array.eset[, array.eset$condition != "Healthy"]
soma.eset <- soma.eset[, soma.eset$condition != "Healthy"]

# get intersecting data ---------------------------------------------
eset.list <- list(array = array.eset, soma = soma.eset)
shared.data <- get_intersecting_data(eset.list) 

#Run Jive ------------------------------------------------------
set.seed(12345) # for reproducibility
jive.out <- jive_wrapper(data.list = shared.data$expr, 
                         z.score = TRUE, frob.scale = TRUE, save.scale.info = TRUE,
                         pdat = shared.data$pdat, 
                         method = "perm",
                         id.col = id.col)



#Save output ----------------------------------------------
saveRDS(jive.out, JIVE.OUT.PATH)

R dplyr Biobase BiocGenerics r.jive From line 5 of jive_run/jive_subject_level_run_noHealthy.R

library(r.jive)
library(dplyr)
library(Biobase)
library(BiocGenerics)

source("scripts/util/Processing/get_intersecting_data.R")
source("scripts/util/JIVE/JIVE_wrapper.R")

id.col <- "patient_id"

# set paths --------------------------------------------------------- 
#input
ARRAY.ESET.PATH <- snakemake@input[["array_in"]]#"Data/Microarray/analysis_output/stability/stable_microarray_subject_level_features.rds"
SOMA.ESET.PATH <- snakemake@input[["soma_in"]]#"Data/Somalogic/analysis_output/stability/stable_somalogic_subject_level_features.rds"

#output
JIVE.OUT.PATH <- snakemake@output[[1]]#"Integration_output/jive/subject_onlyHealthy/jive.rds"

if(!dir.exists(dirname(JIVE.OUT.PATH))){
  dir.create(dirname(JIVE.OUT.PATH))
}

# load data ---------------------------------------------------------
array.eset <- readRDS(ARRAY.ESET.PATH)
soma.eset <- readRDS(SOMA.ESET.PATH)

#Subset, Keep only healthy
array.eset <- array.eset[, array.eset$condition == "Healthy"]
soma.eset <- soma.eset[, soma.eset$condition == "Healthy"]

# get intersecting data ---------------------------------------------
eset.list <- list(array = array.eset, soma = soma.eset)
shared.data <- get_intersecting_data(eset.list) 

tmp <- c( 'patient_id', c('condition', 'race', 'gender', 'Age'))

#Run Jive ------------------------------------------------------
set.seed(100) # for reproducibility
jive.out <- jive_wrapper(data.list = shared.data$expr, 
                         z.score = TRUE, frob.scale = TRUE, save.scale.info = TRUE,
                         pdat = shared.data$pdat, 
                         method = "perm",
                         id.col = id.col)



#Save output ----------------------------------------------
saveRDS(jive.out, JIVE.OUT.PATH)

R dplyr Biobase BiocGenerics r.jive From line 5 of jive_run/jive_subject_level_run_onlyHealthy.R

library(r.jive)
library(dplyr)
library(Biobase)
library(BiocGenerics)

source("scripts/util/Processing/get_intersecting_data.R")
source("scripts/util/JIVE/JIVE_wrapper.R")

id.col <- "patient_id"

# set paths --------------------------------------------------------- 
#input
ARRAY.ESET.PATH <- snakemake@input[["array_in"]]#"Data/Microarray/analysis_output/stability/stable_microarray_subject_level_features.rds"
SOMA.ESET.PATH <- snakemake@input[["soma_in"]]#"Data/Somalogic/analysis_output/stability/stable_somalogic_subject_level_features.rds"

#output
JIVE.OUT.PATH <- snakemake@output[[1]]#"Integration_output/jive/subject/jive.rds"

#if(!dir.exists(dirname(JIVE.OUT.PATH))){
#  dir.create(dirname(JIVE.OUT.PATH))
#}

# load data ---------------------------------------------------------
array.eset <- readRDS(ARRAY.ESET.PATH)
soma.eset <- readRDS(SOMA.ESET.PATH)

# get intersecting data ---------------------------------------------
eset.list <- list(array = array.eset, soma = soma.eset)
shared.data <- get_intersecting_data(eset.list) 

#Run Jive ------------------------------------------------------
set.seed(1) # for reproducibility
jive.out <- jive_wrapper(data.list = shared.data$expr, 
                         z.score = TRUE, frob.scale = TRUE, save.scale.info = TRUE,
                         pdat = shared.data$pdat, 
                         method = "perm",
                         id.col = id.col)

#Save output ----------------------------------------------
saveRDS(jive.out, JIVE.OUT.PATH)

R dplyr Biobase BiocGenerics r.jive From line 5 of jive_run/jive_subject_level_run.R

source("scripts/util/JIVE/JIVE_pca.R")

#Inputs -----------------------------------------------------
JIVE.PATH <- snakemake@input[["jive_obj"]]

#Outputs ----------------------------------------------------
OUT.PRCOMP.LIST.PATH <- snakemake@output[[1]]

#load data ---------------------------------------------------------
jive <- readRDS(JIVE.PATH)

#Run PCA ------------------------------------------------------------------
prcomp.list <- get_jive_pca(jive)

#Flip PC1 of the joint so that correlated with healthy index
prcomp.list$joint$x[, "PC1"] <- -prcomp.list$joint$x[, "PC1"]

#Save --------------------------------------------------------------
saveRDS(prcomp.list, OUT.PRCOMP.LIST.PATH)

R From line 7 of pca_run/jive_pca_run.R

DATABASE.IN.PATH = snakemake@input[[1]]#'Metadata/monogenic.de-identified.metadata.RData'

## Binary matrix of visits by medications taken, with medications combined into major groups
MEDICATION.TYPES.OUT.PATH = snakemake@output[[1]]#'Medications/medications.types.rds'
## Binary matrix of visits by medications taken, with more resolved medications types
MEDICATION.NAMES.OUT.PATH = snakemake@output[[2]]#'Medications/medications.names.rds'

# Load Database
load(DATABASE.IN.PATH)

# Get data.frame of all medications
medications = monogenic.clinical[c('patient_id', 'visit_id', 'med_type','med_name', 'med_start_date', 'med_end_date')]
medications = unique(medications)

# Change patient_ids and visit_ids to 'P_' and 'V_' format
medications$patient_id = paste0('P', medications$patient_id)
medications$visit_id = paste0('V', medications$visit_id)
monogenic.all.assays$patient_id = paste0('P', monogenic.all.assays$patient_id)
monogenic.all.assays$visit_id = paste0('V', monogenic.all.assays$visit_id)

# Make a map associating visit ids with visit dates
visits = data.frame(visit_date = as.numeric(monogenic.all.assays$visit_date), visit_id = monogenic.all.assays$visit_id)
visits = unique(visits)

visit.dates = visits$visit_date
names(visit.dates) = visits$visit_id

# Associate each visit with its visit date in the medications df 
medications$visit_date = visit.dates[as.character(medications$visit_id)]

# Select only rows for which a medication is being taken during a visit date
select_rows = mapply(function(start, end, date) {
  (is.na(start) & is.na(end)) |
    (is.na(start) & date <= end) | 
    (is.na(end) & start <= date) | 
    (start <= date & date <= end)
}, medications$med_start_date, medications$med_end_date, medications$visit_date)

medications = medications[select_rows, ]

# Associate each visit_id with the corresponding patient id
visit.df = unique(medications[c('patient_id','visit_id')])
visit.map = visit.df$patient_id
names(visit.map) = visit.df$visit_id

# Instantiate a function to turn the long matrix into a binary wide matrix
binarize = function(med_col_name) {
  # Get the visit_ids
  visit_ids = factor(medications$visit_id)
  # Get the medications
  meds = factor(medications[[med_col_name]])
  # Determine if a patient was taking a drug during the visit date
  df.bin = table(visit_id = visit_ids, meds) > 0
  # Convert to a dataframe
  df.bin = as.data.frame(df.bin)
  # Add in the patient_ids and visit_ids
  df.bin$visit_id = rownames(df.bin)
  df.bin$patient_id = visit.map[df.bin$visit_id]
  # Rearrange to make meta data columns first
  df.bin = df.bin[c('patient_id', 'visit_id', levels(meds))]
  # Make all the column names proper names
  colnames(df.bin) = make.names(colnames(df.bin))

  return(df.bin)
}

# Do this at both the high and low medications levels
medications.types = binarize('med_type')
medications.names = binarize('med_name')

# Save results
saveRDS(medications.types, MEDICATION.TYPES.OUT.PATH)
saveRDS(medications.names, MEDICATION.NAMES.OUT.PATH)

R From line 5 of preprocessing/clean_medications.R

set.seed(179)

# Load libraries
library(Biobase)
library(MetaIntegrator)

# Set paths
## Meta-analysis metaintegrator study object
META.OBJECT.IN.PATH = snakemake@input[[1]]#'Reference/jamboree/data_analysis_ready/meta_studies.RDS'
## Gene surrogate signatures
SIGNATURES.IN.PATH = snakemake@input[[2]]#'Classification/transcriptional_surrogates/surrogate_signatures.RDS'

## Results of the enrichment analysis among meta-analysis gene effects
RESULTS.ENRICHMENTS.OUT.PATH = snakemake@output[[1]]#'Reference/jamboree/analysis_output/results/jamboree_enrichment_results.RDS'
## Gene-level meta-analysis results
META.ANALYSIS.RESULTS.OUT.PATH = snakemake@output[[2]]#'Reference/jamboree/analysis_output/results/jamboree_gene_level_results.RDS'

# Signature enrichments
## Get the object with the meta-analysis data
metaObject = readRDS(META.OBJECT.IN.PATH)
## Get the gene surrogate signatures we're interested in
signatures = readRDS(SIGNATURES.IN.PATH)

# Make sure the meta analysis object is in the correct format for meta integrator
stopifnot(checkDataObject(metaObject, "Meta", "Pre-Analysis"))

# Run the meta integrator meta analysis
results = runMetaAnalysis(metaObject)

# Get the statistics from the results
pools = results$metaAnalysis$pooledResults

# Initialize an empty list for the results for each signature
ress = list()

# For each signature
for(signature in names(signatures)) {

  ## Get all the genes in that signature
  genes = unname(unlist(signatures[[signature]]))

  ## Create a map between gene names and their meta analysis (absolute) effect sizes
  effects = abs(pools$effectSize)
  names(effects) = rownames(pools)

  ## See if the genes in the signature tend to have higher (absolute) effect sizes
  res = wilcox.test(effects[names(effects) %in% genes], effects[!names(effects) %in% genes], alternative = 'greater')

  ## Add the results to the results list
  ress[[signature]] = res
}

# Save the results
saveRDS(ress, RESULTS.ENRICHMENTS.OUT.PATH)
saveRDS(pools, META.ANALYSIS.RESULTS.OUT.PATH)

R Biobase MetaIntegrator From line 5 of analysis/results_enrichments.R

set.seed(79)

# Load libraries
library(Biobase)
library(MetaIntegrator)

# Set globals
## Meta integrator signature scores for our signatures of interest
META.SCORES.IN.PATH = snakemake@input[[1]]#'Reference/jamboree/analysis_output/meta_integrator_signature_scores.RDS'

## Jamboree meta-anlyses z-score-based meta result statistics
SCORES.RESULTS.OUT.PATH = snakemake@output[[1]]#'Reference/jamboree/analysis_output/results/jamboree_z_score_results.RDS'

# Signature significance results
## Get the signature scores meta-analysis object
metaObject = readRDS(META.SCORES.IN.PATH)

# Check that the metaObject is in the correct format
stopifnot(checkDataObject(metaObject, "Meta", "Pre-Analysis"))

# Run meta-analysis
results = runMetaAnalysis(metaObject)

# Subset to the desired result statistics
results = results$metaAnalysis

# Save results
saveRDS(results, SCORES.RESULTS.OUT.PATH)

R Biobase MetaIntegrator From line 6 of analysis/results_scores.R

CGPS.IN.PATH = snakemake@input[[1]]#'Reference/jamboree/processed/jamboree_cgps.RDS'
## Series matrix files
SERIES.IN.PATH = snakemake@input[[2]]#'Reference/jamboree/processed/series_matrix_list.rds'

## Cleaned cgps
CGPS.OUT.PATH = snakemake@output[[1]]#'Reference/jamboree/data_analysis_ready/cgps_clean.RDS'

# Read in data
cgps = readRDS(CGPS.IN.PATH)
series = readRDS(SERIES.IN.PATH)

# Remove SLE studies due to the low gene coverage in the OMiCC data
# for this disease
cgps = cgps[names(cgps) != 'SLE']

# Remove unwanted CGPs based on the CGP number for the following reaons:
# DM1_2: Follow up of other study from DM1_1, patients repeated
# RA_3: Largely overlaps with other data already being used in study GSE13501-GPL570
# RA_5: Control group has psoriasis
# RA_10: Largely overlaps with other data already being used in study GSE13501-GPL570
# RA_12: Largely overlaps with other data already being used in study GSE13501-GPL570
# RA_13: Largely overlaps with other data already being used in study GSE13501-GPL570
studies.to.remove = list(DM1 = c(2),
                         MS = c(NULL),
                         RA = c(3, 5, 10, 12, 13),
                         sarcoid = c(NULL))

studies.to.remove = sapply(studies.to.remove, function(study) {paste0('CGP_',study)})

# For each disease
for(disease in names(studies.to.remove)) {
  # For each study in the studies to remove
  studies = studies.to.remove[[disease]]
  cgps.subset = cgps[[disease]]
  for(study in studies) {
    # Remove those studies 
    cgps.subset = cgps.subset[setdiff(names(cgps.subset), studies)]
  }
  # Resave the CGPs for that disease without those studies
  cgps[[disease]] = cgps.subset
}

## Join cgps by study ID, and rename them by study ID

# Copy the CGPs
cgps.new = cgps

# For each disease
for(disease in names(cgps)) {
  # Get the CGPs for that disease
  cgps.subset = cgps[[disease]]
  # Instantiate a list for the study-based groups
  cgps.subset.new = list()
  # For each CGP in the CGPs
  for(cgp in cgps.subset) {
    # Get the study id associated with the cgp
    study.number = cgp$study.info$study
    study.platform = cgp$study.info$platform
    study.id = paste0(study.number,'.',study.platform)

    # If the study is not yet in the new cgps list, add it with empty cases and controls
    if(! study.id %in% names(cgps.subset.new)) {
      cgps.subset.new[[length(cgps.subset.new) + 1]] = list(cases = NULL, controls = NULL)
      names(cgps.subset.new)[[length(cgps.subset.new)]] = study.id
    }

    # Add in the cases and controls from this cgp to other cases and controls
    # from the same study
    cases = cgps.subset.new[[study.id]]$cases
    cgps.subset.new[[study.id]]$cases = unique(c(cases, cgp$case.names))
    controls = cgps.subset.new[[study.id]]$controls
    cgps.subset.new[[study.id]]$controls = unique(c(controls, cgp$control.names))
  }

  # Replace the old OMiCC CGPs with the new ones, organized by study
  cgps.new[[disease]] =  cgps.subset.new
}

# Rename the new cgps
cgps = cgps.new

# Here we edit the studies one at a time to deal with problems that arise

# GSE21942
## Remove technical replicate case samples
gsms.to.remove = c('GSM545843', 'GSM545845')

## We check to make sure we are removing the correct gsms from the series matrix file
# Get the series matrix for this study
check.mat = series$GSE21942.GPL570
# Subset the series matrix to the GSMs have on the study
check.mat = check.mat[check.mat$geo_accession %in% unlist(cgps$MS$GSE21942.GPL570),]
# Get the gsms with 'technical replicate 1' in them in favor of the second tech rep
check.gsms = check.mat$geo_accession[grepl('technical replicate 1', check.mat$title)]
# Check to make sure we removed the correct gsms
stopifnot(all(sort(gsms.to.remove) == sort(check.gsms)))

## We remove these samples
cgps$MS$GSE21942.GPL570$cases = setdiff(cgps$MS$GSE21942.GPL570$cases, gsms.to.remove)
cgps$MS$GSE21942.GPL570$controls = setdiff(cgps$MS$GSE21942.GPL570$controls, gsms.to.remove)

# GSE30210
## We take the last sample from each patient in this longitudinal study
case.gsms.to.keep = c('GSM747681', 'GSM747692', 'GSM747707', 'GSM747725', 'GSM747740', 'GSM747758', 'GSM747766',
                      'GSM747785', 'GSM747800', 'GSM747812', 'GSM747828', 'GSM747841', 'GSM747849', 'GSM747863',
                      'GSM747876', 'GSM747890', 'GSM747903', 'GSM747918')
control.gsms.to.keep = c('GSM747686', 'GSM747695', 'GSM747714', 'GSM747718', 'GSM747732', 'GSM747747', 'GSM747752',
                         'GSM747762', 'GSM747773', 'GSM747793', 'GSM747806', 'GSM747820', 'GSM747835', 'GSM747844',
                         'GSM747854', 'GSM747858','GSM747868', 'GSM747881', 'GSM747899', 'GSM747909', 'GSM747913',
                         'GSM747921')

## We check to make sure we are removing the correct gsms from the series matrix file
# Get the series matrix file
check.mat = series$GSE30210.GPL6947
# Subset to the GSMs we have on the study
check.mat = check.mat[check.mat$geo_accession %in% unlist(cgps$DM1$GSE30210.GPL6947),]
# We remove the time point of the sample name
titles = check.mat$title
titles = sapply(strsplit(titles,'_'), function(x) {x[[1]]})
check.mat$title = titles
# We reverse the order so later samples come first
check.mat = check.mat[rev(1:nrow(check.mat)), ]
# And get the gsms of the non-duplicated names
check.gsms = check.mat$geo_accession[!duplicated(check.mat$title)]
# We check these are equivalent to the gsms we plan to keep
stopifnot(all(sort(c(control.gsms.to.keep, case.gsms.to.keep)) == sort(check.gsms)))

## We keep only these samples
cgps$DM1$GSE30210.GPL6947$cases = case.gsms.to.keep
cgps$DM1$GSE30210.GPL6947$controls = control.gsms.to.keep

# GSE8650
## We remove biological/technical replicates (the last sample is kept for each patient) and patients without symptoms
gsms.to.remove = c('GSM214382', 'GSM214388', 'GSM214390', 'GSM214394', 'GSM214400', 'GSM214406', 'GSM214414',
                   'GSM214416', 'GSM214426', 'GSM214428', 'GSM214442', 'GSM214462', 'GSM214474', 'GSM214484',
                   'GSM214398', 'GSM214420', 'GSM214422', 'GSM214436', 'GSM214438', 'GSM214446', 'GSM214448',
                   'GSM214454', 'GSM214456', 'GSM214458', 'GSM214460', 'GSM214478')

## We check to make sure we are removing the correct gsms from the series matrix file
# We get the series matrix
check.mat = series$GSE8650.GPL96
# We subset it to the GSMs we have from this study
check.mat = check.mat[check.mat$geo_accession %in% unlist(cgps$RA$GSE8650.GPL96),]
# We get the GSMs corresponding to subjects without symptoms
check.gsms.1 = check.mat$geo_accession[check.mat$characteristics_ch1.4 == "Symptoms:  None"]
# We replace the titles with the patient IDs
titles = check.mat$title
titles = sapply(strsplit(titles,'_'), function(x) {x[[3]]})
titles = sapply(strsplit(titles,' '), function(x) {x[[1]]})
titles = tolower(titles)
check.mat$title = titles
# We reverse the order of the matrix to put the last sample for each patient first
check.mat = check.mat[rev(1:nrow(check.mat)), ]
# We get all but the first samples for each patient
check.gsms.2 = check.mat$geo_accession[duplicated(check.mat$title)]
# We put together all the GSMs we've found
check.gsms = unique(c(check.gsms.1, check.gsms.2))
# And we check these are the same as the ones we wish to remove
stopifnot(all(sort(gsms.to.remove) == sort(check.gsms)))

## We remove these samples
cgps$RA$GSE8650.GPL96$cases = setdiff(cgps$RA$GSE8650.GPL96$cases, gsms.to.remove)
cgps$RA$GSE8650.GPL96$controls= setdiff(cgps$RA$GSE8650.GPL96$controls, gsms.to.remove)

## We remove 2 samples that were found to have unreliable diagnoses in the supplemental data from the
## original publication (PMID: 17724127).
## These were JIA patients without diagnosis confirmation upon follow up.
## See supplementary table S1 in original paper for details.
extra.gsms.to.remove = c('GSM214490', 'GSM214492')
cgps$RA$GSE8650.GPL96$cases = setdiff(cgps$RA$GSE8650.GPL96$cases, extra.gsms.to.remove)
cgps$RA$GSE8650.GPL96$controls = setdiff(cgps$RA$GSE8650.GPL96$controls, extra.gsms.to.remove)

# GSE15645
## Remove patients with clinical remission
gsms.to.remove = c('GSM391602', 'GSM391603', 'GSM391604', 'GSM391605', 'GSM391606',
                   'GSM391607', 'GSM391608', 'GSM391609', 'GSM391610', 'GSM391611',
                   'GSM391612', 'GSM391613', 'GSM391614', 'GSM391615', 'GSM391616')

## Check to make sure we are removing the correct patients
# Get series matrix
check.mat = series$GSE15645.GPL570
# Subset to the gsms we have
check.mat = check.mat[check.mat$geo_accession %in% unlist(cgps$RA$GSE15645.GPL570),]
# Find all samples with CR in the title
titles = check.mat$title
# 'CR' stands for clinical remission and 'CRM' stands for clinical remission with medication
check.gsms = check.mat$geo_accession[grepl('_CR_', titles) | grepl('_CRM_', titles)]

# Ensure we are removing the right samples
stopifnot(all(sort(gsms.to.remove) == sort(check.gsms)))

## We remove these patients
cgps$RA$GSE15645.GPL570$cases = setdiff(cgps$RA$GSE15645.GPL570$cases, gsms.to.remove)
cgps$RA$GSE15645.GPL570$controls = setdiff(cgps$RA$GSE15645.GPL570$controls, gsms.to.remove)

# GSE18781
## Case and control GSMs were flipped here (in OMiCC)
controls = cgps$sarcoid$GSE18781.GPL570$cases
cases = cgps$sarcoid$GSE18781.GPL570$controls

cgps$sarcoid$GSE18781.GPL570$cases = cases
cgps$sarcoid$GSE18781.GPL570$controls = controls

# GSE42834
## Remove patients with non-active sarcoid
gsms.to.remove = c('GSM1050754', 'GSM1050759', 'GSM1050762', 'GSM1050763', 'GSM1050766', 'GSM1050774',
                   'GSM1050780', 'GSM1050783', 'GSM1050789', 'GSM1050793', 'GSM1050797', 'GSM1050816',
                   'GSM1050843', 'GSM1050864', 'GSM1050931', 'GSM1050933', 'GSM1050949', 'GSM1050969',
                   'GSM1050973', 'GSM1050975', 'GSM1050976', 'GSM1050977')

## Check to make sure we are removing the correct patients
check.mat = series$GSE42834.GPL10558
check.mat = check.mat[check.mat$geo_accession %in% unlist(cgps$sarcoid$GSE42834.GPL10558), ]
check.gsms = check.mat[check.mat$characteristics_ch1.2 == 'disease state: Non-active sarcoidosis', 'geo_accession']
stopifnot(sort(check.gsms) == sort(gsms.to.remove))

## Remove these patients
cgps$sarcoid$GSE42834.GPL10558$cases = setdiff(cgps$sarcoid$GSE42834.GPL10558$cases, gsms.to.remove)
cgps$sarcoid$GSE42834.GPL10558$controls = setdiff(cgps$sarcoid$GSE42834.GPL10558$controls, gsms.to.remove)

# Check that no GSMs are repeated
gsms = unname(unlist(cgps, recursive = T))
stopifnot(max(table(gsms)) == 1)

# Save the results
saveRDS(cgps, CGPS.OUT.PATH)

R From line 14 of preprocessing/clean_cgps.R

library(MetaIntegrator)
library(preprocessCore)

# Set globals
## Compiled jamboree data
DATA.IN.PATH = snakemake@input[[1]]#'Reference/jamboree/processed/jamboree_data.RDS'
## Clean cgps info
CGPS.IN.PATH = snakemake@input[[2]]#'Reference/jamboree/data_analysis_ready/cgps_clean.RDS'

## Cleaned and compiled meta-integrator object
META.OUT.PATH = snakemake@output[[1]]#'Reference/jamboree/data_analysis_ready/meta_studies.RDS'

# Load data
data.sets = readRDS(DATA.IN.PATH)
cgps = readRDS(CGPS.IN.PATH)

# Remove SLE as we do not plan to use these studies due to the small number of covered genes
data.sets = data.sets[names(data.sets) != 'SLE']

# Change data frames to matrices
data.sets = lapply(data.sets, as.matrix)

# Get the genes each disease's data set has in common
genes = Reduce(intersect, lapply(data.sets, rownames))

# Put together metaObject
## Create an empty meta object
metaObj = list()
## For each disease
for(disease in names(data.sets)) {
  ## Get all the studies in this diseass
  studies = names(cgps[[disease]])
  ## Get the set of data corresponding to this disease
  X = data.sets[[disease]]
  ## For each study
  for(study in studies) {
    ## Get the cases and controls associated with this study
    cgp = cgps[[disease]][[study]]
    case.gsms = cgp$cases
    control.gsms = cgp$controls
    gsms = c(case.gsms, control.gsms)
    ## Create an empty object to hold the data
    dataObj = list()

    ## Create a data frame with the gsms for the pheno slot in the dataObj
    dataObj$pheno = data.frame(gsms = gsms)
    rownames(dataObj$pheno) = gsms

    ## Set the classes associated with each case to be 1 and 
    ## controls to be 0
    dataObj$class = ifelse(gsms %in% case.gsms, 1, 0)
    ## Name the classes with the gsms
    names(dataObj$class) = gsms

    ## Set the name associated with the study in the data object to be the study name
    dataObj$formattedName = study

    ## Get the data for the desired genes and gsms in this study
    expr = X[genes, gsms]
    ## Remove any subjects with NAs
    expr = expr[rowSums(is.na(expr))==0,]
    ## Normalize quantiles within the study, as if some studies were already quantile normalized,
    ## all of them should be
    expr.normalized = normalize.quantiles(expr)
    ## Ensure all values are positive
    if(min(expr.normalized) <= 0) {
      expr.normalized = expr.normalized - min(expr.normalized) + 1
    }
    ## Name the normalized expression matrix the same as the original
    colnames(expr.normalized) = colnames(expr)
    rownames(expr.normalized) = rownames(expr)
    ## Rename the normalized expression matrix
    expr = expr.normalized

    ## Add the data to the dataObj
    dataObj$expr = expr
    ## Add the genes to the data obj
    dataObj$keys = rownames(expr)
    ## Check that the dataObj is in an acceptable form for metaIntegrator
    stopifnot(checkDataObject(dataObj,"Dataset"))
    ## Add this dataObject to the list of metaObjects
    metaObj[[study]] = dataObj
  }
}

# Wrap the metaObj in a list
metaObj = list(originalData = metaObj)
# Check the the metaObj is in an acceptable form for metaIntegrator
stopifnot(checkDataObject(metaObj, "Meta", "Pre-Analysis"))

# Save the results
saveRDS(metaObj, META.OUT.PATH)

R preprocessCore MetaIntegrator From line 4 of preprocessing/clean_jamboree_data.R

CGP.IN.PATHS = list(
  DM1 = snakemake@input[[1]],#'Reference/jamboree/raw/jam_human_DM1.txt',
  MS = snakemake@input[[2]],#'Reference/jamboree/raw/jam_human_MS.txt',
  RA = snakemake@input[[3]],#'Reference/jamboree/raw/jam_human_RA.txt',
  sarcoid = snakemake@input[[4]],#'Reference/jamboree/raw/jam_human_sarcoid.txt',
  SLE = snakemake@input[[5]]#'Reference/jamboree/raw/jam_human_SLE.txt'
)

## Compiled jamboree cgps
CGP.OUT.PATH = snakemake@output[[1]]#'Reference/jamboree/processed/jamboree_cgps.RDS'

# Helper function for getting CGP info
get_info = function(indices, keys, values) {
  ## Parse a set of keys value pairs, limited only to
  ## the indices given by 'indices'

  ## Get all keys of the proper indices
  keys = keys[indices]
  ## Get all values of the proper indices
  values = values[indices]
  ## Extract the key name: this is the last entry after a '_'
  keys = sapply(strsplit(keys,'_'), function(x) {x[[length(x)]]})

  ## Create a key-value pair map
  values = as.list(values)
  names(values) = make.names(keys)
  return(values)
}

# Instantiate a function to get info a associated with a CGP
extract_CGP = function(cgp.meta, cgp.number) {
  ## Collects all information on a certain cgp number from metadata
  ## Get the header corresponding to this CGP number
  header = paste0("^!CGP_", cgp.number, "_")

  ## Get all lines in the meta data corresponding to this CGP
  cgp.meta = grep(header, cgp.meta, value=T)
  ## Remove the CGP header from each line
  cgp.meta = gsub(header, "", cgp.meta)
  ## Split the rest of the line into key value pairs
  cgp.meta = strsplit(cgp.meta, '\t')

  ## Get all of the keys
  keys = sapply(cgp.meta, function(x) {x[[1]]})
  ## Get all of the associated values
  values = sapply(cgp.meta, function(x) {x[[2]]})

  ## Get all lines that are general metadata for the CGPs
  ## These are those in which the keys do not contain the keyword 'condition'
  index = !grepl('condition', keys)
  ## Parse these lines
  study.info = get_info(index, keys, values)

  ## Get all lines that are metadata for the cases
  ## These are those in which the keys contain the keyword 'condition1_sample_'
  index = grepl('condition1_sample_', keys)
  ## Parse these lines
  case.info = get_info(index, keys, values)

  ## Get all lines that are metadata for the controls
  ## These are those in which the keys contain the keyword 'condition1_sample_'
  index = grepl('condition2_sample_', keys)
  ## Parse these lines
  control.info = get_info(index, keys, values)

  ## Get all GSMs for the cases
  ## These correspond to the value in which the key is condition1_sample
  case.names = values[keys == 'condition1_sample']
  ## Get all GSMs for the controls
  ## These correspond to the value in which the key is condition2_sample
  control.names = values[keys == 'condition2_sample']

  ## Put all the metadata into a list
  cgp = list(study.info = study.info,
             case.info = case.info,
             control.info = control.info,
             case.names = case.names,
             control.names = control.names)

  return(cgp)
}

# Instantiate function to extract CGP info from a file path
extract_CGPs = function(file.path) {
  ## Search for all lines containing '!' as the first character;
  ## these are the lines with CGP meta data.
  cgp.meta = grep("^!", readLines(file.path), value=T)
  ## Get the number of CGPs in the file, given by the second entry in the first line of metadata
  num.cgp = as.numeric(unlist(strsplit(cgp.meta[1],"\t"))[2])
  ## For each CGP, extract all information on that cgp from the metadata. Note that the
  ## CGP numbers start at 0
  cgps = lapply(0:(num.cgp-1), function(cgp.number) {extract_CGP(cgp.meta, cgp.number)})
  ## Format the CGP names, and make them start at 1 rather than 0
  names(cgps) = paste0('CGP_', 1:num.cgp)
  return(cgps)
}

# Extract CGP info
cgps = lapply(CGP.IN.PATHS, extract_CGPs)

# Save results
saveRDS(cgps, CGP.OUT.PATH)

R From line 6 of preprocessing/get_cgp_info.R

DATA.IN.PATHS = list(
  DM1 = snakemake@input[[1]],#'Reference/jamboree/raw/jam_human_DM1.txt',
  MS = snakemake@input[[2]],#'Reference/jamboree/raw/jam_human_MS.txt',
  RA = snakemake@input[[3]],#'Reference/jamboree/raw/jam_human_RA.txt',
  sarcoid = snakemake@input[[4]],#'Reference/jamboree/raw/jam_human_sarcoid.txt',
  SLE = snakemake@input[[5]]#'Reference/jamboree/raw/jam_human_SLE.txt'
)

## Compiled jamboree data
DATA.OUT.PATH = snakemake@output[[1]]#'Reference/jamboree/processed/jamboree_data.RDS'

## Instantiate a function to extract data from each file path and convert to a matrix
get_data = function(file.path) {
  data = read.csv(file.path, header = T, sep = "\t", comment.char = "!", row.names = 1)
  data = as.matrix(data)
}

## Apply this function over file paths
data = lapply(DATA.IN.PATHS, get_data)

## Save results
saveRDS(data, DATA.OUT.PATH)

R From line 6 of preprocessing/get_jamboree_data.R

set.seed(1119)

# Load libraries and source utilities
library(MetaIntegrator)
source('scripts/util/Signatures/get_signature_scores.R')

# Set globals
## Cleaned cgps
CGPS.IN.PATH = snakemake@input[[1]]#'Reference/jamboree/data_analysis_ready/cgps_clean.RDS'
## Meta-analysis object
META.OBJ.IN.PATH = snakemake@input[[2]]#'Reference/jamboree/data_analysis_ready/meta_studies.RDS'
## Gene surrogate signatures for the features we wish to test
SIGNATURES.IN.PATH = snakemake@input[[3]]#'Classification/transcriptional_surrogates/surrogate_signatures.RDS'

## Meta-analysis study gene signature scores
SCORES.OUT.PATH = snakemake@output[[1]]#'Reference/jamboree/analysis_output/meta_integrator_signature_scores.RDS'
## Meta integrator gene signature scores for each disease subset
SUBSET.SCORES.OUT.PATH = snakemake@output[[2]]#'Reference/jamboree/analysis_output/meta_integrator_subset_signature_scores.RDS'

# Load data
cgps = readRDS(CGPS.IN.PATH)
metaObj = readRDS(META.OBJ.IN.PATH)
signatures = readRDS(SIGNATURES.IN.PATH)

# Instantiate a function to turn a meta object into a meta score object
get_scores = function(metaObj, signatures) {
  ## Extract the data objects from the metaObj
  dataObjs = metaObj$originalData
  ## For each set of data object
  scores = lapply(dataObjs, function(dataObj) {
    ## Get the signature scores for each signature and turn it into a
    ## new expression matrix
    expr = t(dataObj$expr)
    expr = util.get_signature_scores(expr, signatures) 
    ## Replace the data object with the new expression matrix
    dataObj$expr = t(expr)
    ## Replace the names of the original features with the names of the signatures
    dataObj$keys = rownames(dataObj$expr)
    ## Check that this data object has the correct format
    stopifnot(checkDataObject(dataObj, "Dataset"))
    return(dataObj)
  })
  ## Wrap the scores into a meta object
  metaObj = list(originalData = scores)
  ## Check that this meta object has the correct format
  stopifnot(checkDataObject(metaObj, "Meta", "Pre-Analysis"))
  return(metaObj)
}

# We get the signature scores across diseases
metaObj.scores = get_scores(metaObj, signatures)

# We get the signature scores in each disease
diseases = names(cgps)
## For each disease
metaObj.subset.scores = lapply(diseases, function(disease) {
  ## Find the studies for a single disease
  studies = names(cgps[[disease]])
  ## Subset the meta object to these studies
  metaObj.subset = list(originalData = metaObj$originalData[studies])
  ## Get the signature scores for these studies
  scores = get_scores(metaObj.subset, signatures)
})

names(metaObj.subset.scores) = diseases

# We save the results
saveRDS(metaObj.scores, SCORES.OUT.PATH)
saveRDS(metaObj.subset.scores, SUBSET.SCORES.OUT.PATH)

R MetaIntegrator From line 8 of preprocessing/get_jamboree_scores.R

GEO.IN.PATHS = snakemake@input[1:(length(snakemake@input)-1)] #Reference/jamboree/raw/series_matrices/
## The comparison groups pairs from the jamboree
CGPS.IN.PATH = snakemake@input[[length(snakemake@input)]]#'Reference/jamboree/processed/jamboree_cgps.RDS'

## The series matrices
SERIES.OUT.PATH = snakemake@output[[1]]#'Reference/jamboree/processed/series_matrix_list.rds'

# Load data
cgps <- readRDS(CGPS.IN.PATH)

# Initialize an empty list of series matrices
series.list <- list()

# For each disease
for(nm1 in names(cgps)) {
  # For each CGP under that disease
  for(nm2 in names(cgps[[nm1]])) {
    # Get the GEO study id associated with the CGP
    geo.id <- cgps[[nm1]][[nm2]]$study.info$study
    # Get the GEO platform associated with the CGP
    geo.platform <- cgps[[nm1]][[nm2]]$study.info$platform

    # Get the path for the series matrix from the snakemake inputs
    in.name <- paste(geo.id, geo.platform, 'series_matrix.txt', sep = '_')
    in.path <- grep(paste0('*\\/', in.name), GEO.IN.PATHS, value = T)

    # Read in the series matrix file
    series.matrix = read.table(in.path, header = TRUE, quote = '', comment.char = '', sep = '\t',
                               stringsAsFactors = FALSE)

    # Add the series matrix to the list
    series.name = paste(geo.id, geo.platform, sep = '.')
    series.list[[series.name]] <- series.matrix
  }
}

saveRDS(series.list, file = SERIES.OUT.PATH)

R Snakemake From line 5 of preprocessing/get_series.R

library(Biobase)
library(BiocGenerics)

#Config ------------------------------------------------
#Input
#Path to input cel files
ESET.IN.PATH <- snakemake@input[["eset"]]#"Data/Microarray/raw/eset_rma_out.rds"
META.DATA.PATH <- snakemake@input[["meta"]]#"Metadata/monogenic.de-identified.metadata.RData"

#Output
esetOut <- snakemake@output[[1]]#"Data/Microarray/raw/eset_rma_with_pData.rds"

#load data ----------------------------------------------
eset <- readRDS(ESET.IN.PATH)
load(META.DATA.PATH)

#Add pData ----------------------------------------------
#There was one patient removed from the protocol that no longer is in monogenic.microarray
eset <- eset[, sampleNames(eset) %in% monogenic.microarray$array_filename]
pdat <- monogenic.microarray[match(sampleNames(eset), monogenic.microarray$array_filename),]
stopifnot(identical(sampleNames(eset), pdat$array_filename))

###Add V and P prefixes to visit ID's
pdat[["patient_id"]] <- paste0("P", pdat[["patient_id"]])
pdat[["visit_id"]] <- paste0("V", pdat[["visit_id"]])

pData(eset) <- pdat

# Save --------------------------------------------------
saveRDS(eset, esetOut)

R Biobase BiocGenerics From line 7 of 0_rma/add_pdata.R

library(Biobase)
library(BiocGenerics)
library(pd.hugene.1.0.st.v1)
library(oligo)

#Config ------------------------------------------------
#Input
#Path to input cel files
celDir <- snakemake@input[["cel_dir"]]#"../Monogenic_microarray/Monogenic_Microarray_CEL_File"

#Output
rawOut <- snakemake@output[["raw"]]#"Data/Microarray/raw/raw_featureset.rds"
esetOut <- snakemake@output[["eset"]]#"Data/Microarray/raw/eset_rma_out.rds"

#load data ----------------------------------------------
print("loading Cel files")
celFiles <- list.celfiles(celDir, full.names=TRUE)
rawData <- read.celfiles(celFiles)
print("All files read")

#save raw data ------------------------------------------
saveRDS(rawData, rawOut)

#Perform rma---------------------------------------------
eset <- rma(rawData, target="core")
print("RMA done")

#Save rma expressionset ---------------------------------
saveRDS(eset, esetOut)
print(0)

R pd.hugene.1.0.st.v1 Biobase oligo BiocGenerics From line 1 of 0_rma/rma.R

library(plyr) #need for the pick probeset function
library(dplyr)
library(tidyr)
library(GEOquery)

#sources the function that will find probeset most correlated with PC1 of all probesets for each gene
source("scripts/util/Processing/pick_probeset.R")

# Config ------------------------------------------------------
#input files
#path to microarray data
ARRAY.IN.PATH <- snakemake@input[["eset"]]#"Data/Microarray/raw/eset_rma_with_pData.rds"
ANNOTATION.IN.PATH <- snakemake@input[["probe_anno"]]#"Data/Microarray/probeset/pre_downloaded_ann/probe_annotations_full.csv"

#outfiles
PROBEMAP.OUT.PATH <- snakemake@output[["probemap"]]#"Data/Microarray/probeset/output/probe_annotations.txt" #saves probemap file that is input to the pick probeset function
PICKED.PROBES.OUT.PATH <- snakemake@output[["picked_probes"]]#"Data/Microarray/probeset/output/picked_probes.txt"

#if(!dir.exists(dirname(PICKED.PROBES.OUT.PATH))) dir.create(dirname(PICKED.PROBES.OUT.PATH))

#load data ------------------------------------------------------
eset <- readRDS(ARRAY.IN.PATH) 
annotation <- read.csv(ANNOTATION.IN.PATH, header = TRUE)

#write the probemap file that maps genes to probes
out <- 
  annotation %>% 
  select(ID, Gene.symbol) %>% # keeps only ID and gene columns
  rename(gene = Gene.symbol) %>% # rename column
  mutate(ID = as.character(ID), gene = as.character(gene)) %>%  # Make sure they are character vectors
  filter(!grepl("///",gene) & gene != "")              # Filter out genes that map to more than one ID

write.table(out, file = PROBEMAP.OUT.PATH, sep="\t", quote = F, col.names = T, row.names = F)

# Call pick probeset function. Will write the picked probes to file

pick.probeset(eset, PROBEMAP.OUT.PATH, PICKED.PROBES.OUT.PATH) # generates file.map.pc1

R dplyr tidyr plyr GEOquery From line 5 of 2_probeset/get_anno_pick_probes.R

source('scripts/util/Processing/averageRepeatSamples.R')
source('scripts/util/WGCNA/get_eigengene_scores.R')

# Set paths
## Microarray modules
MODULES.IN.PATH = snakemake@input[[1]]#'Data/Microarray/analysis_output/WGCNA/modules.rds'
## Microarray subject-level training eset
TRAINING.SET.MICROARRAY.ESET.IN.PATH = snakemake@input[[2]]#'Data/Microarray/data_analysis_ready/eset_batch_training_subject.rds'
## Microarray sample-level testing eset
TESTING.SET.MICROARRAY.ESET.IN.PATH = snakemake@input[[3]]#'Data/Microarray/data_analysis_ready/eset_batch_validation_sample.rds'

TESTING.SET.SAMPLE.LEVEL.SCORES.ESET.OUT.PATH = snakemake@output[[1]]#'Data/Microarray/analysis_output/WGCNA/scores_sample_level_testing.rds'
TESTING.SET.SUBJECT.LEVEL.SCORES.ESET.OUT.PATH = snakemake@output[[2]]#'Data/Microarray/analysis_output/WGCNA/scores_subject_level_testing.rds'

# Load data
modules = readRDS(MODULES.IN.PATH)
training.set.microarray.eset = readRDS(TRAINING.SET.MICROARRAY.ESET.IN.PATH)
testing.set.microarray.eset = readRDS(TESTING.SET.MICROARRAY.ESET.IN.PATH)

# Get the sample level module scores for the testing eset
testing.set.sample.level.scores.eset = get_eigengene_scores(training.set.microarray.eset, testing.set.microarray.eset, modules)

# Average over samples within a subject to get the subject level module scores for the testing eset
testing.set.subject.level.scores.eset = averageRepeatSamples(testing.set.sample.level.scores.eset)

# Save results
saveRDS(testing.set.sample.level.scores.eset, TESTING.SET.SAMPLE.LEVEL.SCORES.ESET.OUT.PATH)
saveRDS(testing.set.subject.level.scores.eset, TESTING.SET.SUBJECT.LEVEL.SCORES.ESET.OUT.PATH)

R From line 4 of 4_WGCNA/get_testing_set_microarray_module_scores.R

library(WGCNA)
library(Biobase)

## Set seed
set.seed(10798)

## Source wgcna function
source('scripts/util/WGCNA/runWGCNA.r')
source('scripts/util/Processing/averageRepeatSamples.R')
source('scripts/util/WGCNA/get_eigengene_scores.R')
source('scripts/util/Processing/removeOutlierPatients.R')

# Set GlobalVariables
## Clean sample-level somalogic data
SAMPLES.IN.PATH = snakemake@input[[1]]#'Data/Microarray/data_analysis_ready/eset_batch_training_sample.rds'

## The somalogic WGCNA feature to module map
MODULES.OUT.PATH = snakemake@output[[1]]#'Data/Microarray/analysis_output/WGCNA/modules.rds'
## The sample-level somalogic module scores
SCORES.SAMPLE.LEVEL.OUT.PATH = snakemake@output[[2]]#'Data/Microarray/analysis_output/WGCNA/array_sample_scores.rds'
## The subject-level somalogic module scores
SCORES.SUBJECT.LEVEL.OUT.PATH = snakemake@output[[3]]#'Data/Microarray/analysis_output/WGCNA/array_subject_scores.rds'
## The variances explained by PC1 of each module
VARIANCES.OUT.PATH = snakemake@output[[4]]#'Data/Microarray/analysis_output/WGCNA/array_subject_varexp.rds'
## WGCNA intermediate objects
INTERMEDIATES.OUT.PATH = snakemake@output[[5]]#'Data/Microarray/analysis_output/WGCNA/WGCNA_microarray_intermediates.rds'
## Outlier removal plots
OUTLIER.REMOVAL.PLOTS.OUT.PATH = snakemake@output[[6]]#'Paper_1_Figures/Supplemental_Figure_1/microarray_outlier_removal_for_wgcna.pdf'
## Diagnostic plots from WGCNA module creation
WGCNA.PLOTS.OUT.PATH = snakemake@output[[7]]#'Paper_1_Figures/Supplemental_Figure_1/microarray_wgcna_module_creation.pdf'

## Load data
microarray.samples = readRDS(SAMPLES.IN.PATH)

## Prevent WGCNA from operating with parallel
disableWGCNAThreads()

## Remove outlier samples (and plot results)
pdf(OUTLIER.REMOVAL.PLOTS.OUT.PATH)
microarray.samples.filtered = removeOutlierPatients(microarray.samples, cutHeight = 250)
dev.off()

## Calculate the subject level data without outliers
microarray.subjects = averageRepeatSamples(microarray.samples.filtered)

## Run wgcna function
modules = runWGCNA(microarray.subjects, OUTDIR, method = 'hybrid', pamStage = TRUE, 
                   pamRespectsDendro = FALSE, beta = 12, minModuleSize = 30, deepSplit = 2,
                   intermediate.results.path = INTERMEDIATES.OUT.PATH, 
                   diagnostic.plots.path = WGCNA.PLOTS.OUT.PATH)

## Get the scores associated with each sample for each module
scores.sample.level = get_eigengene_scores(microarray.subjects, microarray.samples, modules)

## Get the variances explained by PC1 of each module
variances = get_eigengene_variance_explained(microarray.subjects, modules)

## Average over repeat samples
scores.subject.level = averageRepeatSamples(scores.sample.level)

## Save modules and scores
saveRDS(modules, file = MODULES.OUT.PATH)
saveRDS(scores.sample.level, file = SCORES.SAMPLE.LEVEL.OUT.PATH)
saveRDS(scores.subject.level, file = SCORES.SUBJECT.LEVEL.OUT.PATH)
saveRDS(variances, file = VARIANCES.OUT.PATH)

R Biobase WGCNA From line 5 of 4_WGCNA/wgcna.R

library(tidyverse)
library(data.table)
library(zip)
library(openxlsx)

#setwd("/hpcdata/sg/sg_data/PROJECTS/Monogenic_Project")

source("scripts/util/paper/abbrev_cond.R")

if(!exists("snakemake")){
  setwd("../..")
  source("scripts/util/paper/parse_snakemake.R")
  parse_snakemake("compile_supp_tables_excel")
}


paths_list <- list(
  s2_TM_members=snakemake@input[["tm_members"]],#"Paper_1_Figures/Figure_1_Tables/microarray_module_members.csv",
  s3_PM_members=snakemake@input[["pm_members"]],#"Paper_1_Figures/Figure_1_Tables/somalogic_module_members.csv",
  s4_TM_enrich=snakemake@input[["tm_enrich"]],#"Paper_1_Figures/Figure_1_Tables/microarray_module_gene_set_enrichments_table.txt",
  s5_PM_enrich_geneset=snakemake@input[["pm_enrich_genesets"]],#"Paper_1_Figures/Figure_1_Tables/somalogic_module_gene_set_enrichments_table.txt",
  s6_PM_enrich_tissue=snakemake@input[["pm_enrich_tissue"]],#"Paper_1_Figures/Figure_1_Tables/somalogic_module_tissues_set_enrichments_table.txt",
  s7_tbnk_DE=snakemake@input[["tbnk_de"]],#"Paper_1_Figures/Figure_2_Tables/cbc_and_tbnks_DE_results.txt",
  s8_PM_DE=snakemake@input[["pm_de"]],#"Paper_1_Figures/Figure_2_Tables/protein_modules_DE_results.txt",
  S9_TM_DE=snakemake@input[["tm_de"]],#"Paper_1_Figures/Figure_2_Tables/gene_modules_DE_results.txt",
  s10_P_feat_DE=snakemake@input[["p_feat_de"]],#"Paper_1_Figures/Figure_2_Tables/protein_features_DE_results.txt",
  s11_T_feat_DE=snakemake@input[["t_feat_de"]],#"Paper_1_Figures/Figure_2_Tables/gene_features_DE_results.txt",
  s12_Jive_PCs=snakemake@input[["jpcs"]],#"Paper_1_Figures/Figure_3_Tables/jive_pcs.csv",
  s13_JIVE_PC_cor_feat=snakemake@input[["jive_pc_feat_cor"]],
  s14_Jive_PC_enrich=snakemake@input[["jpc_enrich"]],#"Paper_1_Figures/Figure_3_Tables/jive_pc_enrichment.csv",
  s15_JIVE_PC_cor_mod_tbnk=snakemake@input[["jpc_cor_tbnk"]],
  s16_IHM_feat_gvi=snakemake@input[["ihm_feat_gvi"]],#"Paper_1_Figures/Figure_4_Tables/healthy_feature_gvi_table.txt",
  s17_IHM_scores=snakemake@input[["ihm_scores"]],#"Paper_1_Figures/Figure_4_Tables/hi_results_full_mod.csv",
  s18_meta_analysis_subjects=snakemake@input[["meta_analysis_n_subj"]],
  s19_sig_genes=snakemake@input[["sig_genes"]],#"Paper_1_Figures/Figure_4_Tables/surrogate_sig_genes.csv",
  s20_meta_analysis=snakemake@input[["meta_analysis"]],#"Paper_1_Figures/Figure_4_Tables/figure_4_meta_analysis_table.txt",
  s21_study_eff_size=snakemake@input[["study_eff_size"]],#"Paper_1_Figures/Figure_4_Tables/figure_4_meta_analysis_effsize_table.txt",
  s22_IHM_sig_prot=snakemake@input[["ihm_sig_prot"]],#"Paper_1_Figures/Figure_5_Tables/proteomic_surrogate_ihm.csv"
  s23_ihm_age_cxcl9_lm=snakemake@input[["ihm_age_cxcl9"]]
)

EXCEL.OUT.PATH <- snakemake@output[[1]]#"Paper_1_Figures/supp_tables.xlsx"

dat_list <- lapply(paths_list, fread, data.table = FALSE, nThread = 1)

anno_list <- list(
  s2_TM_members="Supplementary Table 2 : Transcriptomic Module (TM) gene membership",
  s3_PM_members="Supplementary Table 3 : Proteomic Module (PM) gene membership",
  s4_TM_enrich="Supplementary Table 4 : Transcriptomic Module (TM) gene set enrichment",
  s5_PM_enrich_geneset="Supplementary Table 5 : Proteomic Module (PM) gene set enrichment",
  s6_PM_enrich_tissue="Supplementary Table 6 : Proteomic Module (PM) tissue enrichment",
  s7_tbnk_DE="Supplementary Table 7 : CBC + TBNK feature differential abundance",
  s8_PM_DE="Supplementary Table 8 : Proteomic Module (PM) differential expression",
  S9_TM_DE="Supplementary Table 9 : Transcriptomic Module (TM) differential expression",
  s10_P_feat_DE="Supplementary Table 10 : Proteomic feature differential expression",
  s11_T_feat_DE="Supplementary Table 11 : Transcriptomic feature differential expression",
  s12_Jive_PCs="Supplementary Table 12 : JIVE Principal Component scores for each subject",
  s13_JIVE_PC_cor_feat="Supplementary Table 13 : JIVE Principal Component scores correlation with transcriptomic and proteomic features",
  s14_Jive_PC_enrich="Supplementary Table 14 : JIVE Principal Component Enrichment",
  s15_JIVE_PC_cor_mod_tbnk="Supplementary Table 15 : JIVE Principal Component scores correlation with modules and cell frequencies",
  s16_IHM_feat_gvi="Supplementary Table 16 : Classifier Global Variable Importance (GVI)",
  s17_IHM_scores="Supplementary Table 17 : Immune Health Metric (IHM) scores",
  s18_meta_analysis_subjects="Supplementary Table 18 : Studies included in meta-analysis and number of subjects",
  s19_sig_genes="Supplementary Table 19 : Transcriptomic surrogate signature genes used in meta-analysis",
  s20_meta_analysis="Supplementary Table 20 : Meta-analysis summary",
  s21_study_eff_size="Supplementary Table 21 : Meta-analysis within study effect sizes for the IHM surrogate signature",
  s22_IHM_sig_prot="Supplementary Table 22 : Proteomic Immune Health Metric (IHM) surrogate proteins",
  s23_ihm_age_cxcl9_lm="Supplementary Table 23 : Linear model results: IHM ~ age + cxcl9. In Monogenic and Baltimore data."
)

anno_list2 <- list(
  s2_TM_members="Additional Details: Modules were created with WGCNA R package. Both stable and unstable features were included. A feature's stability (variance explained by subject > .5) is shown. The number of features in each module are also shown.",
  s3_PM_members="Additional Details: Modules were created with WGCNA R package. Both stable and unstable features were included. A feature's stability (variance explained by subject > .5) is shown. The number of features in each module are also shown.",
  s4_TM_enrich="Additional Details: Enrichment determined with Fisher's Exact Test. P values adjusted with Benjamini-Hochberg procedure.",
  s5_PM_enrich_geneset="Additional Details: Enrichment determined with Fisher's Exact Test. P values adjusted with Benjamini-Hochberg procedure.",
  s6_PM_enrich_tissue="Additional Details: '25 Data from the Human Protein Atlas in tab-separated format', proteinatlas.tsv, was downloaded from https://www.proteinatlas.org/about/download. Genes were grouped into 3 categories per tissue based on characterization by Human Protein Atlas. strict = 'Tissue enriched', medium = 'Tissue enriched' + 'Tissue enhanced', general = 'Tissue enhanced + 'Tissue enriched' + 'Group enriched'. See source column for description of categories were included for that particular set.",
  s7_tbnk_DE="Additional Details: Linear models fit with Limma R package, comparing each condition to healthy controls while controlling for age and sex.",
  s8_PM_DE="Additional Details: Linear models fit with Limma R package, comparing each condition to healthy controls while controlling for age and sex.",
  S9_TM_DE="Additional Details: Linear models fit with Limma R package, comparing each condition to healthy controls while controlling for age and sex.",
  s10_P_feat_DE="Additional Details: Linear models fit with Limma R package, comparing each condition to healthy controls while controlling for age and sex.",
  s11_T_feat_DE="Additional Details: Linear models fit with Limma R package, comparing each condition to healthy controls while controlling for age and sex.",
  s12_Jive_PCs="Additional Details: Data were averaged for each subject and the stable transcriptomic and serum protein features were selected and used to to compute JIVE PC scores.",
  s13_JIVE_PC_cor_feat="Additional Details: The JIVE PC Scores for every subject were tested for correlation with all modules and cell population frequenceis",
  s14_Jive_PC_enrich="Additional Details: The correlation of each gene in the whole blood transcriptome data was computed. Gene set enrichment was then performed with the CameraPR function from the Limma R package",
  s15_JIVE_PC_cor_mod_tbnk="Additional Details: The JIVE PC Scores for every subject were tested for correlation with all modules and cell population frequenceis.",
  s16_IHM_feat_gvi="Additional Details: The Global Variable Importance is a measure of how useful a particular feature was to the classifier. P values were determined through a permutation test",
  s17_IHM_scores="Additional Details: IHM scores are the leave one out cross-validation scores predicting Healthy vs. Disease for each subject using the TMs, PMs, cell frequencies and grey module proteins. A higher score indicates a that this subject is more similar to the healthy subjects according to the classifier",
  s18_meta_analysis_subjects="Additional Details: Comparison Group Pairs described in Lau et al. F1000Research 5 (2016) were combined for each study and further curated as described in 'notes' column.",
  s19_sig_genes="Additional Details: Surrogate transcriptomic signatures of predictive features from IHM classifier were derived by searching for transcriptomic features highly correlated with the feature of interest (e.g. PM or cell population frequency)",
  s20_meta_analysis="Additional Details: Meta-analysis of transcriptomic surrogate signatures in autoimmunity datasets was performed with MetaIntegrator R package.",
  s21_study_eff_size="Additional Details: Meta-analysis of transcriptomic surrogate signature of IHM in autoimmunity datasets was performed with MetaIntegrator R package.",
  s22_IHM_sig_prot="Additional Details: Surrogate protein signature of IHM was derived by searching for protein features correlated with IHM",
  s23_ihm_age_cxcl9_lm="Additional Details: For the monogenic data, the IHM from the classifier was used directly in the linear model. For the Baltimore Aging cohort, the IHM proteomic surrogate was used."
)


wb <- createWorkbook()
## Add a worksheet
for(nm in names(dat_list)){
  dat <- dat_list[[nm]]
  if("condition" %in% colnames(dat)){
    dat$condition <- abbrev_cond(dat$condition)
  }
  if("adj.P.Val" %in% colnames(dat)){
    dat <- dat %>% rename(Adjusted.Pvalue = adj.P.Val)
  }
  if("p.adj" %in% colnames(dat)){
    dat <- dat %>% rename(Adjusted.Pvalue = p.adj)
  }
  if("AdjP" %in% colnames(dat)){
    dat <- dat %>% rename(Adjusted.Pvalue = AdjP)
  }
  if("p" %in% colnames(dat)){
    dat <- dat %>% rename(P.Value = p)
  }
  if("Pval" %in% colnames(dat)){
    dat <- dat %>% rename(P.Value = Pval)
  }
  colnames(dat) <- gsub("FDR", "AdjustedPVal", colnames(dat))

  addWorksheet(wb, nm)
  writeData(wb, nm, anno_list[[nm]])
  writeData(wb, nm, anno_list2[[nm]], startRow  = 3)
  writeData(wb, nm, dat, startRow = 5)
}



#Add featcounts for the modules for stable vs unstable
TM_FEAT_COUNT_IN_PATH <- snakemake@input[["tm_feat_counts"]]
PM_FEAT_COUNT_IN_PATH <- snakemake@input[["pm_feat_counts"]]
tm_feat_count <- fread(TM_FEAT_COUNT_IN_PATH, data.table = FALSE, nThread = 1)
pm_feat_count <- fread(PM_FEAT_COUNT_IN_PATH, data.table = FALSE, nThread = 1)
writeData(wb, 1, tm_feat_count, startRow = 5, startCol = ncol(dat_list[[1]]) + 3)
writeData(wb, 2, pm_feat_count, startRow = 5, startCol = ncol(dat_list[[2]]) + 3)

saveWorkbook(wb, file = EXCEL.OUT.PATH, overwrite = TRUE)

R Snakemake tidyverse data.table openxlsx zip From line 10 of Paper_Figures/compile_all_supp_tables_excel.R

library(ggplot2)
library(ggrepel)
library(reshape2)
library(Biobase)
library(pheatmap)

# Load utility functons
source('scripts/util/Plotting/feature_and_module_heatmaps.R')
source('scripts/util/Processing/averageRepeatSamples.R')

# Set paths
SOMALOGIC.ESET.IN.PATH = snakemake@input[[1]]#'Data/Somalogic/data_analysis_ready/analysis_ready_subject_level_training_somalogic.rds'
SOMALOGIC.VP.IN.PATH = snakemake@input[[2]]#'Data/Somalogic/analysis_output/stability/somalogic_features_standard_vp.rds'
SOMALOGIC.MODULES.IN.PATH = snakemake@input[[3]]#'Data/Somalogic/analysis_output/wgcna_results/modules.rds'
MICROARRAY.ESET.IN.PATH = snakemake@input[[4]]#'Data/Microarray/data_analysis_ready/eset_batch_training_subject.rds'
MICROARRAY.VP.IN.PATH = snakemake@input[[5]]#'Data/Microarray/analysis_output/stability/microarray_features_standard_vp.rds'
MICROARRAY.MODULES.IN.PATH = snakemake@input[[6]]#'Data/Microarray/analysis_output/WGCNA/modules.rds' 

PROTEIN.HEATMAP.OUT.PATH = snakemake@output[[1]]#'Paper_1_Figures/Figure_1/1d_proteomic.png'
GENE.HEATMAP.OUT.PATH = snakemake@output[[2]]#'Paper_1_Figures/Figure_1/1d_transcriptomic.png'

# Create the feature-feature correlation heatmap for the proteins (only using stable features)
png(PROTEIN.HEATMAP.OUT.PATH, res = 72 * 3, height = 3*480, width = 3*480)
eset = readRDS(SOMALOGIC.ESET.IN.PATH)
vp = readRDS(SOMALOGIC.VP.IN.PATH)
modules = readRDS(SOMALOGIC.MODULES.IN.PATH)
features = rownames(vp)[vp$Residuals < .5]
eset = eset[features,]
modules = modules[features]
module.colors = setdiff(unique(modules), 'grey')
modules = factor(modules, levels = c(module.colors, 'grey'))
plot_feature_correlations(eset, modules)
dev.off()

# Create the feature-feature correlation heatmap for the genes (only using stable features)
png(snakemake@output[[2]], res = 72 * 3, height = 3*480, width = 3*480)
eset = readRDS(MICROARRAY.ESET.IN.PATH)
vp = readRDS(MICROARRAY.VP.IN.PATH)
modules = readRDS(MICROARRAY.MODULES.IN.PATH)
features = rownames(vp)[vp$Residuals < .5]
eset = eset[features,]
modules = modules[features]
module.colors = setdiff(unique(modules), 'grey')
modules = factor(modules, levels = c(module.colors, 'grey'))
plot_feature_correlations(eset, modules)
dev.off()

R ggplot2 ggrepel reshape2 Biobase From line 6 of Figure_1/figure_1_addendum_version_2.R

library(ggplot2)
library(dplyr)
library(tidyr)
library(reshape2)
library(Biobase)

#setwd("/hpcdata/sg/sg_data/PROJECTS/Monogenic_Project")

if(!exists("snakemake")){
  setwd("../../..")
  source("scripts/util/paper/parse_snakemake.R")
  parse_snakemake("figure_1c_condition_counts")
}

# Source utilities
source('scripts/util/paper/abbrev_cond.R')

# Set paths
META.DATA.IN.PATH = snakemake@input[["meta"]]#'Metadata/monogenic.de-identified.metadata.RData'
#META.DATA.IN.PATH = 'Metadata/monogenic.de-identified.metadata.RData'

COND.GROUP.IN.PATH <- snakemake@input[["cond_groups"]]#"Reference/condition_groups.csv"

FIGURE.1c.OUT.PATH = snakemake@output[[1]]#"Paper_1_Figures/Figure_1/1c_v2.pdf"
cond_group_dat <- read.csv(COND.GROUP.IN.PATH, stringsAsFactors = FALSE, check.names = FALSE)


# Figure 1c -- stacked barplots displaying counts for conditions, broken down by primary and repeated samples

## Load the monogenic metadata database
load(META.DATA.IN.PATH)

## Get the set of conditions represented in the Discovery and Validation sets (not QC sets)
#conditions = monogenic.all.assays[monogenic.all.assays$analysis_group %in% c('Discovery','Validation'), 'condition']
conditions = monogenic.all.assays[monogenic.all.assays$analysis_group %in% c('Discovery'), 'condition']

## Remove any unknown samples from these conditions
unknowns = c('Unknown', 'U_PI3K', 'U_Kastner', 'U_STAT1', 'U_CTLA4', 'U_Telomere')
conditions = setdiff(conditions, unknowns)

#train_test_count <- monogenic.all.assays %>%
#  filter(assay_type %in% c("Microarray", "Somalogic")) %>%
#  select(patient_id, visit_id, analysis_group, condition) %>%
#  filter(!grepl("CTRL", condition, ignore.case = T)) %>%
#  filter(!grepl("control", condition, ignore.case = T)) %>%
#  mutate(condition2 = condition) %>%
#  mutate(condition = replace(condition, !condition %in% conditions, "Other")) %>%
#  unique() %>% # Ensure there are no repeated visits
#  ##filter(analysis_group %in% 'Discovery') %>% # Filter to just training samples
#  select(patient_id, condition, analysis_group) %>%
#  unique() %>%
#  group_by(condition) %>%
#  summarise(ntrain = sum(analysis_group == "Discovery"), 
#  ntest= sum(analysis_group == "Validation")) %>% as.data.frame()
#
#sum(train_test_count$ntest)


#counts_other = monogenic.all.assays %>%
#  filter(assay_type %in% c("Microarray", "Somalogic")) %>%
#  select(patient_id, visit_id, analysis_group, condition) %>%
#  filter(!grepl("CTRL", condition, ignore.case = T)) %>%
#  filter(!grepl("control", condition, ignore.case = T)) %>%
#  mutate(condition2 = condition) %>%
#  mutate(condition = replace(condition, !condition %in% conditions, "Other")) %>%
#  unique() %>% # Ensure there are no repeated visits
#  ##filter(analysis_group %in% 'Discovery') %>% # Filter to just training samples
#  select(-analysis_group) %>%
#  group_by(condition2) %>%
#  summarise(primary.samples = length(unique(patient_id)), # Count the number of primary samples for each condition
#            other = sum(condition == "Other") != 0,
#            all.samples = length(unique(visit_id))) %>% # Count the number of total samples for each condition
#  mutate(repeat.samples = all.samples - primary.samples) %>% # Get the number of repeat samples for each condtion
#  ungroup() %>%
#  arrange(desc(all.samples)) %>% # Sort by the total number of samples
#  filter(other) %>%
#  select(-other)

## Create a matrix with counts of primary and repeat training set samples for each condition
counts_both_test_train = monogenic.all.assays %>%
  filter(assay_type %in% c("Microarray", "Somalogic")) %>%
  select(patient_id, visit_id, analysis_group, condition) %>%
  filter(!grepl("CTRL", condition, ignore.case = T)) %>%
  filter(!grepl("control", condition, ignore.case = T)) %>%
  #mutate(condition2 = condition) %>%
  #mutate(condition = replace(condition, !condition %in% conditions, "Other")) %>%
  filter(condition %in% c("Healthy", conditions)) %>%
  unique() %>% # Ensure there are no repeated visits
  ##filter(analysis_group %in% 'Discovery') %>% # Filter to just training samples
  group_by(condition, analysis_group) %>%
  summarise(primary.samples = length(unique(patient_id)), # Count the number of primary samples for each condition
            all.samples = length(unique(visit_id))) %>% # Count the number of total samples for each condition
  mutate(repeat.samples = all.samples - primary.samples) %>% # Get the number of repeat samples for each condtion
  ungroup() 

counts_train <- counts_both_test_train %>% 
        filter(analysis_group == "Discovery") %>%
        select(-c(analysis_group, all.samples))
counts_test <- counts_both_test_train %>% filter(analysis_group == "Validation") %>%
        select(condition, primary.samples) %>%
        rename(set.aside.samples =primary.samples)

counts <- left_join(counts_train, counts_test) %>%
        mutate(set.aside.samples = replace(set.aside.samples, is.na(set.aside.samples), 0))

counts <- counts %>%
  group_by(condition) %>%
  left_join(cond_group_dat) %>%
  arrange(desc(primary.samples + set.aside.samples + repeat.samples)) %>% # Sort by the total number of samples
  #select(-all.samples) %>%
  mutate(cond_abbrev = abbrev_cond(condition)) # Change the condition column to be a factor with levels in the same order

counts$cond_abbrev <- factor(counts$cond_abbrev, levels = rev(counts$cond_abbrev))


counts <- counts %>%
  gather(key = variable, value = value, -c(condition, cond_group, cond_abbrev)) %>%
  mutate(variable = as.character(variable)) %>%
  mutate(variable = replace(variable, variable == 'primary.samples', 'Primary Sample')) %>%
  mutate(variable = replace(variable, variable == 'repeat.samples', 'Additional Samples:\nCollected at different visits/timepoints')) %>%
  mutate(variable = replace(variable, variable == 'set.aside.samples', 'Set Aside Sample')) %>%
  mutate(variable = factor(variable, levels = rev(c('Primary Sample', "Set Aside Sample", 'Additional Samples:\nCollected at different visits/timepoints')))) %>%
  ungroup()


counts <- counts %>%
        mutate(cond_group = as.character(cond_group)) %>%
        mutate(cond_group = replace(cond_group, condition == "Healthy", "Healthy")) %>%
        mutate(cond_group = replace(cond_group, cond_group == "TERT.TERC", "Telo")) %>%
        mutate(cond_group = factor(cond_group, levels = c("Healthy", "AI", "Telo", "PID")))


## Create the barplots
#p = ggplot(counts, aes(x = cond_abbrev, y = value, fill = variable)) + 
p = ggplot(counts, aes(x = cond_abbrev, y = value, fill = cond_group, alpha = variable)) + 
        geom_bar(stat = 'identity') + 
  ylab('# Samples') + xlab('Condition') + labs(alpha = 'Sample Type', fill = "Group") + theme_bw() + 
  facet_grid(cond_group~1, scales = "free", space = "free") +
  theme(
    axis.title.x = element_text(size = 15),
    axis.text.x = element_text(size = 15),
    axis.title.y = element_text(size = 15),
    axis.text.y = element_text(size = 15),
    legend.text = element_text(size = 15),
    legend.title = element_text(size = 15),
    strip.text = element_blank(),
    strip.background = element_blank(),
  ) + coord_flip() +
  scale_alpha_manual(values = c(.4, .8, 1))
   #scale_fill_manual(values = c('steelblue2','steelblue4'), guide = guide_legend(reverse = TRUE))

## Save the barplots
ggsave(FIGURE.1c.OUT.PATH, p, device = 'pdf', height = 7, width = 10)

R Snakemake ggplot2 dplyr tidyr reshape2 Biobase From line 1 of Figure_1/figure1_condition_counts.R

library(tidyverse)
library(reshape2)
library(cowplot)

MICROARRAY.MODULES.VP.IN.PATH = snakemake@input[["array_mod"]]#'Data/Microarray/analysis_output/variance_decomposition/microarray_modules_vp.RDS'
SOMALOGIC.MODULES.VP.IN.PATH = snakemake@input[["soma_mod"]]#'Data/Somalogic/analysis_output/variance_decomposition/somalogic_modules_vp.RDS'
TBNKS.VP.IN.PATH = snakemake@input[["tbnk"]]#'Data/TBNK/analysis_output/variance_decomposition/tbnk_features_vp.RDS'

FIG.OUT.PATH <- snakemake@output[[1]]#"Paper_1_Figures/Figure_1/module_tbnk_varpart.pdf"

#setwd("../../..")
#MICROARRAY.MODULES.VP.IN.PATH = 'Pipeline_out/Data/Microarray/analysis_output/variance_decomposition/microarray_modules_vp.RDS'
#SOMALOGIC.MODULES.VP.IN.PATH = 'Pipeline_out/Data/Somalogic/analysis_output/variance_decomposition/somalogic_modules_vp.RDS'
#TBNKS.VP.IN.PATH = 'Pipeline_out/Data/TBNK/analysis_output/variance_decomposition/tbnk_features_vp.RDS'
#
#FIG.OUT.PATH <- "Pipeline_out/Paper_1_Figures/Figure_1/module_tbnk_varpart.pdf"

source("scripts/util/Plotting/tbnk_featurename_replace.R")

# Figure 1f -- stacked bar plots of variance partitions for modules and tbnks

## Load the data
microarray.vp = readRDS(MICROARRAY.MODULES.VP.IN.PATH)
somalogic.vp = readRDS(SOMALOGIC.MODULES.VP.IN.PATH)
tbnks.vp = readRDS(TBNKS.VP.IN.PATH)

## Define the medications we wish to include in the summary variance for medication
medications = c('IgG.replacement', 'Anti.TNF', 'IFN.gamma', 'Immune.stimulators', 
                'Anti.IL1', 'Antifungal', 'Steroid', 'Anti.inflammatories',
                'Antibiotic', 'Immunosuppressant', 'Antibody')

## Instantiate a function to summarize each variance partition into a data frame
extract_results = function(results, medications) {

  ### Convert the variance parititon results to a data frame
  df = data.frame(results)

  ### Create the initial data frame, which displays the variation associated with patient and condition
  ### and which summarizes variance attributed to each medication type into a single score
  df = df %>% 
    tibble::rownames_to_column(var = 'module') %>%
    mutate(Medication = rowSums(as.matrix(df[, medications]))) %>%
    select(-!!medications) %>%
    select(-Residuals) %>%
    mutate(module = factor(module, levels = rev(sort(unique(module))))) %>%
    melt(id.vars = 'module', variable.name = 'Covariate') %>%
    mutate(Covariate = factor(Covariate, levels = rev(c('Patient', 'Condition', 'Medication'))))

  ### Get the amount of variation associated with patient for each feature
  df.patient = df %>% 
    filter(Covariate == 'Patient') %>%
    arrange(value)

  ### Arrange the modules to order by patient-associated variation
  df = df %>%
    mutate(module = module %>% as.character(module)) %>%
    mutate(module = factor(module, levels = unique(df.patient$module)))

  return(df)
}

## Instantiate a function to create the barplot from the extracted results
bar_plot = function(df, colors) {
  p = ggplot(df, aes(x = module, y = value, fill = Covariate)) +
    geom_bar(stat = 'identity', show.legend = TRUE) +
    theme_bw() + ylim(0,1) + coord_flip() + 
    scale_fill_manual(values = colors, guide = guide_legend(reverse = TRUE))
}

## Panel 1 -- TBNKs
### Manually rename the tbnk features names to make them clearer and more concside
df.tbnks = extract_results(tbnks.vp, medications)
#levels(df.tbnks$module) = levels(df.tbnks$module) %>% 
levels(df.tbnks$module) = levels(df.tbnks$module) %>% replace_tbnk_names()

df.tbnks <- df.tbnks %>%
        mutate(category = tbnk_groups(module, "new name"))


### Choose the tbnk plotting colors
colors = c('seagreen1','seagreen3','seagreen4')

### Create the tbnk barplot
p.tbnks = bar_plot(df.tbnks, colors) + 
  #xlab('Major Peripheral \nImmune Parameters') +
  xlab('') + 
  facet_grid(category~1, space = "free", scales = "free_y") +
  ylab('Variance Explained') + 
  theme(
    axis.text.x = element_text(size = 15),
    axis.text.y = element_text(size = 15),
    axis.title.x = element_blank(),
    axis.title.y = element_text(size = 15),
    strip.text.y = element_text(size = 15),
    legend.text = element_text(size = 15),
    legend.title = element_text(size = 15),
    strip.text.x = element_blank(),
    strip.background.x = element_blank()
  )

## Panel 2
df.somalogic = extract_results(somalogic.vp, medications)

### Choose somalogic plotting colors
colors = c('thistle1','plum1','violetred')

levels(df.somalogic$module) <- replace_mod_names_single_type(levels(df.somalogic$module), "PM")

### Create the somalogic barplot
p.somalogic = bar_plot(df.somalogic, colors) + 
  #xlab('Proteomic Modules') +
  xlab('') + 
  facet_grid("PM" ~ 1) +
  theme(
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_text(size = 15),
    axis.title.x = element_blank(),
    axis.title.y = element_text(size = 15),
    strip.text.y = element_text(size = 15),
    legend.text = element_text(size = 15),
    legend.title = element_text(size = 15),
    strip.text.x = element_blank(),
    strip.background.x = element_blank()
  )

## Panel 3
df.microarray = extract_results(microarray.vp, medications)
levels(df.microarray$module) <- replace_mod_names_single_type(levels(df.microarray$module), "TM")


### Choose the microarray colors
colors = c('steelblue2','royalblue','royalblue4')

### Create the microarray bar plot
p.microarray = bar_plot(df.microarray, colors) + 
  #xlab('Transcriptomic Modules') + 
  xlab('') + 
  ylab('') + 
  facet_grid("TM" ~ 1) +
  theme(
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_text(size = 15),
    axis.title.x = element_text(size = 15),
    axis.title.y = element_text(size = 15),
    strip.text.y = element_text(size = 15),
    legend.text = element_text(size = 15),
    legend.title = element_text(size = 15),
    strip.text.x = element_blank(),
    strip.background.x = element_blank()
  )

## Put the panels together
p = plot_grid(p.microarray, p.somalogic, p.tbnks, 
              align = "hv", 
              axis = "tblr",
              nrow = 3, 
              rel_heights = c(nrow(df.microarray), nrow(df.somalogic) + 2, nrow(df.tbnks) + 7))

ggsave(FIG.OUT.PATH, p, device = 'pdf', height = 12, width = 10)

R ggplot2 tidyverse cowplot reshape2 From line 1 of Figure_1/figure1_module_tbnk_vp.R

library(ggplot2)
library(ggrepel)
library(dplyr)
library(tidyr)
library(reshape2)
library(Biobase)
library(cowplot)

# Source utilities
source('scripts/util/paper/abbrev_cond.R')

# Set paths
META.DATA.IN.PATH = snakemake@input[[1]]#'Metadata/monogenic.de-identified.metadata.RData'
MICROARRAY.FEATURES.VP.IN.PATH = snakemake@input[[2]]#'Data/Microarray/analysis_output/variance_decomposition/microarray_features_vp.RDS'
SOMALOGIC.FEATURES.VP.IN.PATH = snakemake@input[[3]]#'Data/Somalogic/analysis_output/variance_decomposition/somalogic_features_vp.RDS'

FIGURE.1e.OUT.PATH = snakemake@output[["e"]]#"Paper_1_Figures/Figure_1/1e.pdf"
FIGURE.1f.OUT.PATH = snakemake@output[["f"]]#"Paper_1_Figures/Figure_1/1f.pdf"

# Figure 1a -- cartoon
## See box: users > Dylan Hirsch > Monogenic Project - Paper 1 > fig1.a.pptx

# Figure 1b -- cartoon, see box
## See box: users > Dylan Hirsch > Monogenic Project - Paper 1 > fig1.b.pptx

# Figure 1d -- feature-feature correlation matrices organized by module for somalogic and microarray, with enrichment-based annotations
## See scripts/Paper_Figures/Figure_1/figure_1_addendum_version_(latest).R for the script used to make the matrix plots
## See Paper_1_Figures/Figure_1/1d_proteomic.png and Paper_1_Figures/Figure_1/1d_transcriptomic.png for the plots themselves.
## These plots were saved as png rather than pdf because they are very large and saving them in high resolution would be prohibitively
## slow and memory intensive.
## See the Enrichments directory (outside the script_dylan folder) for the tables of enrichments.
## These tables are generated using scripts/Enrichments/analysis/write_enrichment_directories.R
## The plots with enrichment annotations can be found in box: users > Dylan Hirsch > Monogenic Project - Paper 1 > fig1.d.pptx

# Figure 1e -- stable versus unstable parameter example
set.seed(80)

## Randomly generate stable trajectories for the 'feature' in two patients
x = .8 ## x is the inital point of the feature for the first patient
xs = x ## xs is the time course of the feature for the first patient
y = .2 ## y is the inital point of the feature for the second patient
ys = y ## ys is the time course of the feature for the second patient
for(i in 1:999) { ## To get from one time point to the next
  ## randomly perturb the feature, with an elastic force pulling it back to its original value
  xs[[i + 1]] = xs[[i]] + .01 * rnorm(1) + .05 * (x - xs[[i]]) 
  ys[[i + 1]] = ys[[i]] + .01 * rnorm(1) + .05 * (y - ys[[i]])
}

## Create a data frame combining the stable trajectories
df1 = data.frame(subject = factor(c(rep(1, 1000), rep(2, 1000))), time = c(1:1000, 1:1000), parameter = c(xs, ys),
                 group = 'stable parameter')

## Randomly generate unstable trajectories for the 'feature' in two patients
x = .52 ## x is the inital point of the feature for the first patient
xs = x ## xs is the time course of the feature for the first patient
y = .48 ## y is the inital point of the feature for the second patient
ys = y ## ys is the time course of the feature for the second patient
for(i in 1:999) { ## To get from one time point to the next
  ## randomly perturb the feature, with an elastic force pulling it back to its original value
  xs[[i + 1]] = xs[[i]] + .05 * rnorm(1) + .05 * (x - xs[[i]])
  ys[[i + 1]] = ys[[i]] + .05 * rnorm(1) + .05 * (y - ys[[i]])
}

## Create a data frame combining the stable trajectories
df2 = data.frame(subject = factor(c(rep(1, 1000), rep(2, 1000))), time = c(1:1000, 1:1000), parameter = c(xs, ys),
                 group = 'unstable parameter')

## Join the stable and unstable trajectory dataframes
df = rbind(df1, df2)

## Make a line plot for each trajectory, separating by stability
p = ggplot(df, aes(x = time, y = parameter, color = subject)) + 
  scale_color_manual(values = c('steelblue2','lightcoral')) + 
  geom_line(show.legend = FALSE) + theme_bw() + facet_wrap(~group, nrow = 2) +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), axis.ticks.y = element_blank(),
        axis.text.y = element_blank(),
        axis.title = element_text(size = 20),
        axis.title.x = element_text(size = 20),
        axis.title.y = element_text(size = 20),
        strip.text = element_text(size = 20)) + ylab('parameter value')

## Save the plot
ggsave(FIGURE.1e.OUT.PATH, p, device = 'pdf', height = 6, width = 9)

# Figure 1e -- violin plots of feature level variance partitioning for microarray and somalogic data

## Load the variance partitions
microarray.vp = readRDS(MICROARRAY.FEATURES.VP.IN.PATH)
somalogic.vp = readRDS(SOMALOGIC.FEATURES.VP.IN.PATH)

## Define the medications we wish to include in the summary variance for medication
medications = c('IgG.replacement', 'Anti.TNF', 'IFN.gamma', 'Immune.stimulators', 
                'Anti.IL1', 'Antifungal', 'Steroid', 'Anti.inflammatories',
                'Antibiotic', 'Immunosuppressant', 'Antibody')

## Instantiate a function to summarize each variance partition into a data frame
summarize_vp = function(results, medications) {
  df = data.frame(results)
  df = df %>%
    tibble::rownames_to_column(var='module') %>%
    mutate(module = as.character(module)) %>%
    mutate(Medication = rowSums(as.matrix(df[, medications]))) %>%
    select(-!!medications) %>%
    melt(id.vars = 'module') %>%
    mutate(variable = factor(variable, levels = c('Patient','Condition','Medication','Residuals')))
  return(df)
}

## Instantiate a function to create violin plot
violin_plot = function(df, colors) {
  ggplot(df, aes(x = variable, y = value, fill = variable)) + theme_bw() + 
    geom_violin(scale = "width", position = position_dodge(.8), width = .7, show.legend = FALSE) +
    scale_fill_manual(values = colors) + ylab('Variance Explained') + xlab('Covariate')
}

## Get the summaried variance partitions
df.microarray = summarize_vp(microarray.vp, medications)
df.somalogic = summarize_vp(somalogic.vp, medications)

## Set colors for protein
colors = c('violetred','plum1','thistle1','grey')

## Make somalogic violin plot
p1 = violin_plot(df.somalogic, colors) + ggtitle('Proteomic Features') +
  theme(
    axis.text.x = element_text(size = 15, angle = 30, hjust = 1),
    axis.title.x = element_text(size = 15),
    axis.text.y = element_text(size = 15),
    axis.title.y = element_text(size = 15),
    title = element_text(size = 15))

## Set colors for microarray 
colors = c('royalblue4','royalblue','steelblue2','grey')

## Make microarray violin plot
p2 = violin_plot(df.microarray, colors) + ggtitle('Transcriptomic Features') +
  theme(
    axis.text.x = element_text(size = 15, angle = 30, hjust = 1),
    axis.title.x = element_text(size = 15),
    axis.text.y = element_blank(),
    axis.title.y = element_blank(),
    axis.ticks.y = element_blank(),
    title = element_text(size = 15))

## Combine the two plots together in a single grid
p = plot_grid(p1, p2, align = "h", ncol = 2, rel_widths = c(10,9))

## Save the results
ggsave(FIGURE.1f.OUT.PATH, p, device = 'pdf', height = 6, width = 9)

R ggplot2 dplyr tidyr cowplot ggrepel reshape2 Biobase From line 2 of Figure_1/figure_1_version_6.R

knitr::opts_chunk$set(echo = TRUE)
#knitr::opts_knit$set(root.dir = normalizePath("../../../"))

R Markdown From line 10 of Figure_1_Statistics/figure_1_stats_version_1.Rmd

library(ggplot2)
library(ggrepel)
library(dplyr)
library(reshape2)
library(Biobase)

R Markdown ggplot2 dplyr ggrepel reshape2 Biobase From line 16 of Figure_1_Statistics/figure_1_stats_version_1.Rmd

if(!exists("snakemake")){
  setwd("../../..")
  source("scripts/util/paper/parse_snakemake.R")
  parse_snakemake(rule = "figure_1_statistics")
}

## Monogenic data base
METADATA.IN.PATH = snakemake@input[[1]] ## "Metadata/monogenic.de-identified.metadata.RData"

R Markdown Snakemake From line 25 of Figure_1_Statistics/figure_1_stats_version_1.Rmd

load(METADATA.IN.PATH)

R Markdown From line 39 of Figure_1_Statistics/figure_1_stats_version_1.Rmd

df = monogenic.all.assays %>%
  select(visit_id, patient_id, patient_age_at_time_of_blood_draw, condition, analysis_group) %>%
  mutate(patient_id = paste0('P', patient_id)) %>%
  mutate(age = patient_age_at_time_of_blood_draw) %>%
  mutate(group = ifelse(condition == "Healthy", "Control", "Case")) %>%
  filter(analysis_group == 'Discovery') %>%
  select(-patient_age_at_time_of_blood_draw, -analysis_group)

R Markdown From line 44 of Figure_1_Statistics/figure_1_stats_version_1.Rmd

df = df %>%
  group_by(visit_id) %>%
  summarise(age = mean(age), patient_id = unique(patient_id), condition = unique(condition), group = unique(group)) %>%
  ungroup() %>%
  group_by(patient_id) %>%
  summarise(age = mean(age), condition = unique(condition), group = unique(group)) %>%
  ungroup()

R Markdown From line 55 of Figure_1_Statistics/figure_1_stats_version_1.Rmd

ks.test(df$age[df$group == 'Control'], df$age[df$group == 'Case'])

R Markdown From line 67 of Figure_1_Statistics/figure_1_stats_version_1.Rmd

74	#load(METADATA.IN.PATH)

R Markdown From line 74 of Figure_1_Statistics/figure_1_stats_version_1.Rmd

df = monogenic.all.assays %>%
  select(patient_id, gender, condition, analysis_group) %>%
  unique() %>%
  mutate(patient_id = paste0('P', patient_id)) %>%
  filter(analysis_group == 'Discovery') %>%
  mutate(group = ifelse(condition == 'Healthy', 'Case', 'Control')) %>%
  group_by(gender, group) %>%
  summarise(total = length(patient_id)) %>%
  ungroup()

R Markdown From line 79 of Figure_1_Statistics/figure_1_stats_version_1.Rmd

X = df %>%
  dcast(gender ~ group, value.var = 'total') %>%
  select(-gender) %>%
  as.matrix()

R Markdown From line 92 of Figure_1_Statistics/figure_1_stats_version_1.Rmd

fisher.test(X)

R Markdown From line 100 of Figure_1_Statistics/figure_1_stats_version_1.Rmd

#dat <- monogenic.all.assays %>%
#        select(patient_id, visit_id, condition, analysis_group, assay_type) %>%
#        filter(analysis_group == "Discovery") %>%
#        filter(assay_type %in% c("Microarray", "Somalogic", ""))

R Markdown From line 104 of Figure_1_Statistics/figure_1_stats_version_1.Rmd

TBNK.IN.PATH <- snakemake@input[["tbnk"]]
SOMA.IN.PATH <- snakemake@input[["soma"]]
ARRAY.IN.PATH <- snakemake@input[["array"]]
TBNK.TEST.IN.PATH <- snakemake@input[["tbnk_test"]]
SOMA.TEST.IN.PATH <- snakemake@input[["soma_test"]]
ARRAY.TEST.IN.PATH <- snakemake@input[["array_test"]]

tbnk <- readRDS(TBNK.IN.PATH)
soma <- readRDS(SOMA.IN.PATH)
array_ <- readRDS(ARRAY.IN.PATH)
tbnk_test <- readRDS(TBNK.TEST.IN.PATH)
soma_test <- readRDS(SOMA.TEST.IN.PATH)
array_test <- readRDS(ARRAY.TEST.IN.PATH)


data_list <- list(tbnk = tbnk, soma = soma, array_ = array_, tbnk_test = tbnk_test,
                   soma_test = soma_test, array_test = array_test)

R Markdown From line 115 of Figure_1_Statistics/figure_1_stats_version_1.Rmd

conditions = monogenic.all.assays[monogenic.all.assays$analysis_group %in% c('Discovery'), 'condition'] %>% unique

## Remove any unknown samples from these conditions
unknowns = c('Unknown', 'U_PI3K', 'U_Kastner', 'U_STAT1', 'U_CTLA4', 'U_Telomere')
conditions = setdiff(conditions, unknowns)

out_list <- list()
for(nm in names(data_list)){
  pdat <- pData(data_list[[nm]])
  pdat <- pdat %>% filter(condition %in% conditions)

  n_subjects <- length(unique(pdat$patient_id))
  n_visits <- length(unique(pdat$visit_id))
  n_repeats <- n_visits - n_subjects 

  n_subj_w_repeats <- sum(table(pdat$patient_id) > 1)

  out_list[[nm]] <- c(n_subjects = n_subjects, n_visits = n_visits, n_repeats = n_repeats, n_subj_w_repeats= n_subj_w_repeats)
  #print(nm)
  #print(vec)
}
mat <- do.call(rbind, out_list)
print(mat)

print("totals")
print(colSums(mat))

print("total number of unique disease patients")
pat_list <- lapply(data_list, function(x){
 pData(x) %>%
         filter(condition %in% conditions) %>%
         filter(condition != "Healthy") %>%
         pull(patient_id)

                   })
length(unique(unlist(pat_list)))

print("There were some additional patients with tbnk data, but no array/somalogic")
pats_with_tbnk_but_no_array_soma <- setdiff(unique(unlist(pat_list[c(1, 4)])), unique(unlist(pat_list[c(2,3,5,6)])))
pats_with_tbnk_but_no_array_soma

pats_with_tbnk_but_no_array_soma <- setdiff(unique(unlist(pat_list[c(1)])), unique(unlist(pat_list[c(2,3,5,6)])))
pats_with_tbnk_but_no_array_soma

print("total number of unique healthy")
healthy_list <- lapply(data_list, function(x){
 pData(x) %>%
         filter(condition == "Healthy") %>%
         pull(patient_id)

                   })
length(unique(unlist(healthy_list)))
setdiff(unique(unlist(healthy_list[c(1, 4)])), unique(unlist(healthy_list[c(2,3,5,6)])))

print("total number of unique visits")
visit_list <- lapply(data_list, function(x){
 pData(x) %>%
         filter(condition %in% conditions) %>%
         pull(visit_id)

                   })
length(unique(unlist(visit_list)))


#monogenic.all.assays %>% 
#        mutate(pid = paste0("P", patient_id)) %>%
#        filter( pid %in% pats_with_tbnk_but_no_array_soma) %>%
#        select(patient_id, assay_type, visit_id)

R Markdown From line 135 of Figure_1_Statistics/figure_1_stats_version_1.Rmd

#conditions = monogenic.all.assays[monogenic.all.assays$analysis_group %in% c('Discovery'), 'condition'] %>% unique
#
### Remove any unknown samples from these conditions
#unknowns = c('Unknown', 'U_PI3K', 'U_Kastner', 'U_STAT1', 'U_CTLA4', 'U_Telomere')
#conditions = setdiff(conditions, unknowns)
#
#counts_both_test_train = monogenic.all.assays %>%
        #filter(assay_type %in% c("Microarray", "Somalogic")| patient_id %in% c(42, 59, 78 171)) %>%
#  select(patient_id, visit_id, analysis_group, condition) %>%
#  filter(!grepl("CTRL", condition, ignore.case = T)) %>%
#  filter(!grepl("control", condition, ignore.case = T)) %>%
#  filter(condition %in% c("Healthy", conditions)) %>%
#  unique() %>% # Ensure there are no repeated visits
#  group_by(analysis_group) %>%
#  summarise(primary.samples = length(unique(patient_id)), # Count the number of primary samples for each condition
#            all.samples = length(unique(visit_id))) %>% # Count the number of total samples for each condition
#  mutate(repeat.samples = all.samples - primary.samples) %>% # Get the number of repeat samples for each condtion
#  ungroup() 
#
#counts_both_test_train
#
#colSums(as.matrix(counts_both_test_train[, -1]))
#```
#
#
#```{r}
#all_pats <- unique(c(tbnk$patient_id, array_$patient_id, soma$patient_id))
#total_pats <- length(all_pats)
#print("Total unique patients")
#print(total_pats)

R Markdown From line 207 of Figure_1_Statistics/figure_1_stats_version_1.Rmd

dat <- monogenic.all.assays %>%
        filter(analysis_group == "Discovery") %>%
        filter(assay_type %in% c("Microarray", "Somalogic"))

tab <- table(monogenic.all.assays$patient_id, monogenic.all.assays$visit_id)
dup_pats <- rownames(tab)[rowSums(tab > 0) > 1]

ranges <- dat %>% 
        filter(patient_id %in% dup_pats) %>%
        group_by(patient_id) %>%
        mutate(blood_draw_date = as.numeric(blood_draw_date)) %>%
        summarise(min_date = min(blood_draw_date), max_date = max(blood_draw_date)) %>%
        mutate(day_range = max_date - min_date) %>%
        filter(day_range > 0)

quantile(ranges$day_range)

R Markdown From line 240 of Figure_1_Statistics/figure_1_stats_version_1.Rmd

library(tidyverse)
library(Biobase)
library(BiocGenerics)

#setwd("/hpcdata/sg/sg_data/PROJECTS/Monogenic_Project/")

if(!exists("snakemake")){
  setwd("../../..")
  source("scripts/util/paper/parse_snakemake.R")
  parse_snakemake("figure_1_tables_module_members")
}

SOMALOGIC_MODULE_IN_PATH <- snakemake@input[["soma_mod"]]#"Data/Somalogic/analysis_output/wgcna_results/modules.rds"
MICROARRAY_MODULE_IN_PATH <- snakemake@input[["array_mod"]]#"Data/Microarray/analysis_output/WGCNA/modules.rds"


SOMALOGIC_STABILITY_IN_PATH <- snakemake@input[["soma_stability"]]
MARRAY_STABILITY_IN_PATH <- snakemake@input[["array_stability"]]

SOMALOGIC_ESET_IN_PATH <- snakemake@input[["soma_eset"]]
MARRAY_ESET_IN_PATH <- snakemake@input[["array_eset"]]

SOMALOGIC_OUT_PATH <- snakemake@output[["soma"]]#"Paper_1_Figures/Figure_1_Tables/somalogic_module_members.csv"
MICROARRAY_OUT_PATH <- snakemake@output[["array"]]#"Paper_1_Figures/Figure_1_Tables/microarray_module_members.csv"

SOMALOGIC_FEATCOUNT_OUT_PATH <- snakemake@output[["soma_feat_counts"]]#"Paper_1_Figures/Figure_1_Tables/somalogic_module_members.csv"
MICROARRAY_FEATCOUNT_OUT_PATH <- snakemake@output[["array_feat_counts"]]#"Paper_1_Figures/Figure_1_Tables/microarray_module_members.csv"
source("scripts/util/Plotting/tbnk_featurename_replace.R")

soma_mods <- readRDS(SOMALOGIC_MODULE_IN_PATH)

array_mods <- readRDS(MICROARRAY_MODULE_IN_PATH)

soma_stability <- readRDS(SOMALOGIC_STABILITY_IN_PATH)
array_stability <- readRDS(MARRAY_STABILITY_IN_PATH)

soma_stab_feat <- rownames(soma_stability)
array_stab_feat <- rownames(array_stability)

soma_eset <- readRDS(SOMALOGIC_ESET_IN_PATH)
array_eset <- readRDS(MARRAY_ESET_IN_PATH)

soma_featdata <- featureData(soma_eset)@data
array_featdata <- featureData(array_eset)@data

array_featdata <- array_featdata %>%
        rownames_to_column(var = "feature") %>%
        .[, !grepl("UniGene", colnames(.))] %>%
        .[, !grepl("GO", colnames(.))] %>%
        .[, !grepl("Chromosome", colnames(.))] %>%
        select(-c(Nucleotide.Title, Platform_CLONEID, Platform_ORF))

soma_featdata <- soma_featdata %>%
        rownames_to_column(var = "feature") %>%
        select(-c(Units, Type, Dilution))


soma_mods_dat <-
        soma_mods %>%
        enframe(value = "module_color", name = "feature") %>%
        mutate(module_name = replace_mod_names_single_type(module_color, sheet = "PM")) %>%
        mutate(tmp = gsub("grey", "PM9999", module_name)) %>%
        left_join(soma_featdata) %>%
        mutate(stable = feature %in% soma_stab_feat) %>%
        arrange(tmp) %>%
        select(-tmp)

soma_mods_dat %>%
        write_csv(path = SOMALOGIC_OUT_PATH)

array_mods_dat <- array_mods %>%
        enframe(value = "module_color", name = "feature") %>%
        mutate(module_name = replace_mod_names_single_type(module_color, sheet = "TM")) %>%
        left_join(array_featdata) %>%
        mutate(stable = feature %in% array_stab_feat) %>%
        arrange(module_name)

array_mods_dat %>%
        write_csv(path = MICROARRAY_OUT_PATH)


array_feat_counts <- array_mods_dat %>%
        group_by(module_name, module_color, stable) %>%
        summarise(n = n()) %>%
        ungroup %>%
        spread(key = stable, value = n) %>%
        rename(n_stable = "TRUE", n_unstable = "FALSE") %>%
        mutate(n_total = n_stable + n_unstable)

soma_feat_counts <- soma_mods_dat %>%
        group_by(module_name, module_color, stable) %>%
        summarise(n = n()) %>%
        ungroup %>%
        spread(key = stable, value = n) %>%
        rename(n_stable = "TRUE", n_unstable = "FALSE") %>%
        mutate(n_total = n_stable + n_unstable)


write_csv(array_feat_counts, MICROARRAY_FEATCOUNT_OUT_PATH)
write_csv(soma_feat_counts, SOMALOGIC_FEATCOUNT_OUT_PATH)

R Snakemake tidyverse Biobase BiocGenerics From line 1 of Figure_1_Tables/figure_1_tables_module_members.R

library(reshape2)
library(tidyverse)
library(Biobase)

# Set paths
if(exists("snakemake")){
  ENRICHMENTS.IN.PATHS = list(
    protein.gene.sets = snakemake@input[[1]],#'Data/Somalogic/analysis_output/enrichments/somalogic_module_gene_set_enrichments.RDS',
    protein.tissue.sets = snakemake@input[[2]],#'Data/Somalogic/analysis_output/enrichments/somalogic_module_tissue_set_enrichments.RDS',
    gene.gene.sets = snakemake@input[[3]]#'Data/Microarray/analysis_output/enrichments/microarray_module_gene_set_enrichments.RDS'
  )

  META.DATA.IN.PATH = snakemake@input[[4]]#'Metadata/monogenic.de-identified.metadata.RData'

  ENRICHMENT.TABLES.OUT.PATHS = list(
    protein.gene.sets = snakemake@output[[1]],#'Paper_1_Figures/Figure_1_Tables/somalogic_module_gene_set_enrichments_table.txt',
    protein.tissue.sets = snakemake@output[[2]],#'Paper_1_Figures/Figure_1_Tables/somalogic_module_tissues_set_enrichments_table.txt',
    gene.gene.sets = snakemake@output[[3]]#'Paper_1_Figures/Figure_1_Tables/microarray_module_gene_set_enrichments_table.txt'
  )

  DEMOGRAPHICS.TABLE.OUT.PATH = snakemake@output[[4]]#'Paper_1_Figures/Figure_1_Tables/demographics_table.txt'
}else{
  ENRICHMENTS.IN.PATHS = list(
    protein.gene.sets = 'Data/Somalogic/analysis_output/enrichments/somalogic_module_gene_set_enrichments.RDS',
    protein.tissue.sets = 'Data/Somalogic/analysis_output/enrichments/somalogic_module_tissue_set_enrichments.RDS',
    gene.gene.sets = 'Data/Microarray/analysis_output/enrichments/microarray_module_gene_set_enrichments.RDS'
  )

  META.DATA.IN.PATH = 'Metadata/monogenic.de-identified.metadata.RData'

  ENRICHMENT.TABLES.OUT.PATHS = list(
    protein.gene.sets = 'Paper_1_Figures/Figure_1_Tables/somalogic_module_gene_set_enrichments_table.txt',
    protein.tissue.sets = 'Paper_1_Figures/Figure_1_Tables/somalogic_module_tissues_set_enrichments_table.txt',
    gene.gene.sets = 'Paper_1_Figures/Figure_1_Tables/microarray_module_gene_set_enrichments_table.txt'
  )

  DEMOGRAPHICS.TABLE.OUT.PATH = 'Paper_1_Figures/Figure_1_Tables/demographics_table.txt'

  setwd("/hpcdata/sg/sg_data/PROJECTS/Monogenic_Project")
}

source("scripts/util/Plotting/tbnk_featurename_replace.R")

# Create tables for the enrichment modules
## Load data
enrichments.list = lapply(ENRICHMENTS.IN.PATHS, readRDS)

## For each type of enrichment
for(enrichment.name in names(enrichments.list)) {

  ## Get the enrichments corresponding to that enrichment type
  enrichments = enrichments.list[[enrichment.name]]

  ## Get the table output path for that enrichment type
  enrichment.out.path = ENRICHMENT.TABLES.OUT.PATHS[[enrichment.name]]

  ## For each module in those enrichments
  enrichments = lapply(names(enrichments), function(module) {

    ## Get the enrichments corresponding to that module
    enrichment = enrichments[[module]]

    ## Set the name of the enrichments to the gene set names
    enrichment$Set.Name = rownames(enrichment)

    ## Set the module to the module color
    enrichment$module = module

    ## Rearrang the appended columns to be the first two columns
    n = ncol(enrichment)
    enrichment = enrichment[,c(n, n-1, 1:(n-2))]
  })

  ## Combine all the modules' enrichment data frames
  enrichments = Reduce(rbind, enrichments)

  enrichments <- enrichments %>% rename(module_color = module)

  if(startsWith(enrichment.name, "protein")){
    enrichments <- enrichments %>% 
            mutate(module_name = replace_mod_names_single_type(module_color, sheet = "PM"))
  }else if(startsWith(enrichment.name, "gene")){
    enrichments <- enrichments %>% 
            mutate(module_name = replace_mod_names_single_type(module_color, sheet = "TM"))
  }

  enrichments <- enrichments[, c("module_name", setdiff(colnames(enrichments), "module_name"))]

  if(enrichment.name != "protein.tissue.sets"){
    enrichments <- enrichments %>%
            group_by(module_name) %>%
            mutate(rank = rank(across.Adjusted.Pvalue)) %>%
            filter(rank < 101) %>%
            arrange(across.Adjusted.Pvalue) %>%
            select(-rank)
  }

  enrichments <- enrichments %>%
        select(-Adjusted.Pvalue) %>%
        rename(Adjusted.Pvalue = across.Adjusted.Pvalue) %>%
        mutate(tmp = gsub("grey", "PM9999", module_name)) %>%
        arrange(tmp) %>%
        select(-tmp)

  ## Output the data frames as a table 
  write.table(enrichments, file = enrichment.out.path, row.names = FALSE, sep = '\t', quote = FALSE)
}

# Create tables for demographics
## Load Data
load(META.DATA.IN.PATH)

## ## Get metadata for patients in training set
## Get ages for patients in training set
df = monogenic.all.assays %>%
  select(visit_id, patient_id, patient_age_at_time_of_blood_draw, condition, analysis_group, gender, race, ethnicity) %>%
  mutate(patient_id = paste0('P', patient_id)) %>%
  mutate(age = patient_age_at_time_of_blood_draw) %>%
  mutate(group = ifelse(condition == "Healthy", "Control", "Case")) %>%
  filter(analysis_group == 'Discovery') %>%
  select(-patient_age_at_time_of_blood_draw, -analysis_group)

## Average ages over various visits, first by averaging ages from samples within a visit (should all be almost exactly the same),
## and then averaging ages from samples across visits
df = df %>%
  group_by(visit_id) %>%
  summarise(age = mean(age),
            patient_id = unique(patient_id),
            condition = unique(condition), 
            group = unique(group), 
            gender = unique(gender),
            ethnicity = unique(ethnicity),
            race = unique(race)) %>%
  ungroup() %>%
  group_by(patient_id) %>%
  summarise(age = mean(age),
            condition = unique(condition), 
            group = unique(group), 
            gender = unique(gender),
            ethnicity = unique(ethnicity),
            race = unique(race),
            visits = length(visit_id)) %>%
  ungroup()

## Get the statistics associated with each condition 
## (NOTE: I did not include the IQR/variance for age here so that patient-specific ages
## could not be estimated for conditions with 2 samples)
df = df %>%
  group_by(condition) %>%
  summarise(`number of patients` = length(patient_id),
            `number of samples` = sum(visits),
            `age median` = median(age),
            `percent Female (Sex)` = mean(gender == 'F'),
            `percent Male (Sex)` = mean(gender == 'M'),
            `percent Asian (Race)` = mean(race == 'Asian'),
            `percent Black / African American (Race)` = mean(race == "Black/African Amer"),
            `percent Hawaiian/Pacific Islander (Race)` = mean(race == "Hawaiian/Pac. Island"),
            `percent Unknown (Race)` = mean(race == "Unknown"),
            `percent Multi-Racial (Race)` = mean(race == "Multiple Race"),
            `percent White / Caucasian (Race)` = mean(race == "White"),
            `percent Hispanic or Latino (Ethnicity)` = mean(ethnicity == "Hispanic or Latino"),
            `percent Not Hispanic or Latino (Ethnicity)` = mean(ethnicity == "Not Hispanic or Latino"),
            `percent Unknown (Ethnicity)` = mean(ethnicity == "Unknown")) %>%
  mutate(condition = condition %>% factor(., sort(unique(.))) %>% relevel(., 'Healthy')) %>%
  arrange(condition) %>%
  mutate(`age median` = round(`age median`)) %>%
  as.data.frame()

## Print to table
write.table(df, DEMOGRAPHICS.TABLE.OUT.PATH, sep = '\t', row.names = F, col.names = T)

R Snakemake tidyverse reshape2 Biobase From line 2 of Figure_1_Tables/figure_1_tables_version_1.R

library(Biobase)
library(tidyverse)
library(cowplot)
#setwd("/hpcdata/sg/sg_data/PROJECTS/Monogenic_Project/")

print(str(snakemake))
# Set paths
RESULTS.IN.PATHS = list(
  somalogic.modules = snakemake@input[["soma_mod_de"]],# 'Data/Somalogic/analysis_output/differential_expression/somalogic_modules_DE_results.rds',
  somalogic.features = snakemake@input[["soma_feat_de"]],# 'Data/Somalogic/analysis_output/differential_expression/somalogic_features_DE_results.rds',
  microarray.modules = snakemake@input[["array_mod_de"]],# 'Data/Microarray/analysis_output/differential_expression/microarray_modules_DE_results.rds',
  microarray.features = snakemake@input[["array_feat_de"]],# 'Data/Microarray/analysis_output/differential_expression/microarray_features_DE_results.rds',
  tbnks = snakemake@input[["tbnk_de"]]# 'Data/TBNK/analysis_output/differential_expression/tbnks_features_DE_results.rds'
)
#RESULTS.IN.PATHS = list(
#  somalogic.modules = 'Data/Somalogic/analysis_output/differential_expression/somalogic_modules_DE_results.rds',
#  somalogic.features = 'Data/Somalogic/analysis_output/differential_expression/somalogic_features_DE_results.rds',
#  microarray.modules = 'Data/Microarray/analysis_output/differential_expression/microarray_modules_DE_results.rds',
#  microarray.features = 'Data/Microarray/analysis_output/differential_expression/microarray_features_DE_results.rds',
#  tbnks = 'Data/TBNK/analysis_output/differential_expression/tbnks_features_DE_results.rds'
#)

source("scripts/util/paper/abbrev_cond.R")

COND.GROUPS.IN.PATH <- snakemake@input[["cond_groups"]]#"Reference/condition_groups.rds"

FIG.OUT.PATH <- snakemake@output[[1]]#"Paper_1_Figures/Figure_2/all_feature_bubble_plot_condition_separated.pdf"

condition_group_dat <- readRDS(COND.GROUPS.IN.PATH)

# Load data
results = lapply(RESULTS.IN.PATHS, readRDS) # Load results from limma DE testing


# Add a data type for the combination of protein modules, gene modules, and tbnks

## Get the names of the versus.healthy/versus.all options for the heatmaps
first.levels = names(results$somalogic.modules)
## Get the names of the statistics associated with the DE testing e.g. (t-stat, p value, etc.)
second.levels = names(results$somalogic.modules$versus.healthy)

## For both of the versus_healthy/versus_all options
results$all = lapply(first.levels, function(l1) {

  ## And for each statistic type associated with the DE testing results
  stats = lapply(second.levels, function(l2) {

    ## Get the matrix corresponding to the given statistic for the protein modules
    a1 = results$somalogic.modules[[l1]][[l2]]
    ## Name the protein modules with a 'protein.' prefix to distinguish them from the gene modules
    rownames(a1) = paste0('protein.', rownames(a1))
    ## Get the matrix corresponding to the given statistic for the gene modules
    a2 = results$microarray.modules[[l1]][[l2]]
    ## Name the gene modules with a 'gene.' prefix to distinguish them from the protein modules
    rownames(a2) = paste0('gene.', rownames(a2))
    ## Get the matrix corresponding to the given statistic for the tbnks
    a3 = results$tbnks[[l1]][[l2]]
    ## Name the tbnks with a 'tbnks.' prefix for consistency with the modules
    rownames(a3) = paste0('tbnks.', rownames(a3))

    ## Ensure the columns are in the same order
    a2 = a2[, colnames(a1)]
    a3 = a3[, colnames(a1)]

    ## Wrap these statistics together into a single matrix
    rbind(a1, a2, a3)
  })

  ## Name the statistics matrices with the corresponding statistic names (e.g. t-statistic, cohen's d, etc.)
  names(stats) = second.levels
  return(stats)
})

## Name the module-tbnk-combined results based on whether the versus.healthy option or versus.all option was used 
names(results$all) = first.levels

## Add the patient-condition relationships associated with all the esets in order to provide the number of samples for each condition
#sample.datas$all = unique(Reduce(rbind, sample.datas))

# Get the condition counts associated with each data type
#condition_counts = lapply(sample.datas, function(sample.data) {
#  counts = table(sample.data$condition)
#  counts = counts[names(counts) != "Healthy"]
#})



# make dataframe for first bubble_plot
joined_dat <- lapply(results$all, function(nested_list){
  lapply(names(nested_list), function(nm){
        mat <- nested_list[[nm]]
        dat <- as.data.frame(mat) %>%
        rownames_to_column(var = "feature") %>%
        gather(key = "condition", value = value, -feature)

         colnames(dat) <- gsub("value", nm, colnames(dat))
        dat
  }) %>% Reduce(f = full_join)
}) %>% bind_rows(.id = "comparison")


joined_dat <- joined_dat %>%
        rename(q_val = adj.P.Val, coeff = effect.size, t_stat = t) %>%
        mutate(q_val_filtered = replace(q_val, q_val > .2, NA)) %>%
        mutate(feat_category = sapply(strsplit(feature, "\\."), `[[`, 1))


hc_row <-results$all$versus.healthy$t %>% 
        dist() %>% hclust()
joined_dat$feature <- factor(joined_dat$feature, levels = hc_row$labels[hc_row$order])

hc_col <-results$all$versus.healthy$t %>% 
        t() %>%
        dist() %>% hclust()
joined_dat$condition <- factor(joined_dat$condition, levels = hc_col$labels[hc_col$order])


#plotting everything all together
feat_types <- c("gene", "protein", "tbnks")
names(feat_types) <- feat_types
hc_row_list <- lapply(feat_types, function(feat_category){
  mat <- results$all$versus.healthy$t 
  mat[startsWith(rownames(mat), feat_category), ] %>% 
          dist() %>% hclust()

})

order_all_feat_within_datatype <- sapply(hc_row_list, function(hc){
  row_order <- hc$labels[hc$order]
}) %>% unlist()

joined_dat <- joined_dat %>%
        mutate(feature2 = factor(feature, levels = order_all_feat_within_datatype))

# separating out by feature type

lymph_terms <- c("cd19", "cd3", "nk_cell", "lymph")
innate_terms <- c("eos", "mono", "baso", "NLR", "wbc")
red_terms <- c("platelet", "mcv", "mch", "mchc", "rdw", "hemoglobin", "rbc")

joined_dat <- joined_dat %>%
        mutate(feat_group2 = feat_category) %>%
        mutate(feat_group2 = replace(feat_group2, 
                                     rowSums(sapply(lymph_terms, grepl, feature)) > 0,"Lymphocytes")
               ) %>%
        mutate(feat_group2 = replace(feat_group2, 
                                     rowSums(sapply(innate_terms, grepl, feature)) > 0,"Innate")
               ) %>%
        mutate(feat_group2 = replace(feat_group2, 
                                     rowSums(sapply(red_terms, grepl, feature)) > 0,"RBC & PLT")
               ) %>%
        mutate(feat_group2 = gsub("gene", "TM", feat_group2)) %>%
        mutate(feat_group2 = gsub("protein", "PM", feat_group2)) %>%
        mutate(feat_group2 = factor(feat_group2, levels = c("TM", "PM", "Innate", "Lymphocytes", "RBC & PLT")))

joined_dat <- joined_dat %>%
        mutate(cond_group = condition_group_dat$cond_group[match(condition, condition_group_dat$condition)])


#Add in the new module names that are not colors
joined_dat <- joined_dat %>%
        mutate(feature3 = feature2)

levels(joined_dat$feature3) <- 
        gsub("tbnks\\.", "", levels(joined_dat$feature3))

source("scripts/util/Plotting/tbnk_featurename_replace.R")

levels(joined_dat$feature3) <- replace_mod_names_both(levels(joined_dat$feature3))
levels(joined_dat$feature3) <- replace_tbnk_names(levels(joined_dat$feature3))

levels(joined_dat$condition) <- abbrev_cond(levels(joined_dat$condition))

pdf(FIG.OUT.PATH)
#Versus healthy- main figure
p <- ggplot(joined_dat %>% filter(comparison == "versus.healthy" & 
                                  !condition %in% c("TERT.TERC", "AI", "PID")), 
            aes(x = condition, y = feature3)) +
        geom_point(aes(size = -log10(q_val), fill = coeff, color = q_val < .2), shape = 21) +
        facet_grid(feat_group2 ~ cond_group, scales = "free", space = "free") +
        scale_fill_gradient2(low = "blue", high = "red") +
        scale_color_manual(values = c("white", "black")) +
        theme_bw() +
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
              strip.text.x = element_blank()) +
        ylab("") +
        ggtitle("Versus Healthy")
print(p)


#Versus all - this is supplement
p <- ggplot(joined_dat %>% filter(comparison == "versus.all" &
                                  !condition %in% c("TERT.TERC", "AI", "PID")), 
                                  aes(x = condition, y = feature3)) +
        geom_point(aes(size = -log10(q_val), fill = coeff, color = q_val < .2), shape = 21) +
        facet_grid(feat_group2 ~ cond_group, scales = "free", space = "free") +
        theme_bw() +
        scale_fill_gradient2(low = "blue", high = "red") +
        scale_color_manual(values = c("white", "black")) +
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
              strip.text.x = element_blank()) +
        ylab("") +
        ggtitle("Versus All")
print(p)
dev.off()

R ggplot2 tidyverse cowplot Biobase limma From line 2 of Figure_2/figure_2a_version_4.R

library(tidyverse)
library(Biobase)
library(BiocGenerics)

ESETS.IN.PATHS = list(
  somalogic.modules = snakemake@input[[1]],# 'Data/Somalogic/analysis_output/wgcna_results/scores_subject_level.rds',
  somalogic.features = snakemake@input[[2]],# 'Data/Somalogic/data_analysis_ready/analysis_ready_subject_level_training_somalogic.rds',
  microarray.modules = snakemake@input[[3]],# 'Data/Microarray/analysis_output/WGCNA/array_subject_scores.rds',
  microarray.features = snakemake@input[[4]],# 'Data/Microarray/data_analysis_ready/eset_batch_training_subject.rds',
  tbnks = snakemake@input[[5]]# 'Data/TBNK/data_analysis_ready/tbnk_training_subject_level_eset.rds'
)
#ESETS.IN.PATHS = list(
#  somalogic.modules =  'Data/Somalogic/analysis_output/wgcna_results/scores_subject_level.rds',
#  somalogic.features = 'Data/Somalogic/data_analysis_ready/analysis_ready_subject_level_training_somalogic.rds',
#  microarray.modules = 'Data/Microarray/analysis_output/WGCNA/array_subject_scores.rds',
#  microarray.features = 'Data/Microarray/data_analysis_ready/eset_batch_training_subject.rds',
#  tbnks = 'Data/TBNK/data_analysis_ready/tbnk_training_subject_level_eset.rds'
#)

source("scripts/util/paper/abbrev_cond.R")

FIG.OUT.PATH <- snakemake@output[[1]]#"Paper_1_Figures/Figure_2/all_feature_bubble_plot_condition_separated_nsubj_per_comparison.pdf"

esets <- lapply(ESETS.IN.PATHS, readRDS)

# Get the patient-condition pairings from each eset in order to calculate the number of subjects coming from each data type
sample.datas = lapply(esets, function(eset) {
  sample.data = pData(eset)
  sample.data[, c('patient_id','condition')]
})

n_subj_dat <- lapply(sample.datas, function(dat){
  dat %>% group_by(condition) %>%
          summarise(n = n())
}) %>% bind_rows(.id = "feat_category")

n_subj_dat <- n_subj_dat %>%
        filter(feat_category %in% c("somalogic.modules", "microarray.modules", "tbnks")) %>%
        mutate(feat_category = gsub("somalogic.modules", "protein", feat_category)) %>%
        mutate(feat_category = gsub("microarray.modules", "gene", feat_category)) %>%
        mutate(feat_category = gsub("protein", "PM", feat_category)) %>%
        mutate(feat_category = gsub("gene", "TM", feat_category)) %>%
        mutate(feat_category = gsub("tbnks", "CBC + TBNK", feat_category))

sum_n_subj <- n_subj_dat %>%
        group_by(condition) %>%
        summarise(sum_n_subj = sum(n)) %>%
        arrange(sum_n_subj)

cond_order <- sum_n_subj$condition

n_subj_dat <- n_subj_dat %>% mutate(cond_group = group_cond(condition)) %>%
        left_join(sum_n_subj) %>%
        mutate(condition = factor(condition, levels = cond_order))

levels(n_subj_dat$condition) <- abbrev_cond(levels(n_subj_dat$condition))

pdf(FIG.OUT.PATH, height = 2.5, width = 6)
p2 <- ggplot(n_subj_dat %>% filter( !condition %in% c("TERT.TERC", "AI", "PID")), 
             aes(x = condition, feat_category)) +
        geom_tile(aes(fill= log2(n))) +
        geom_text(aes(label = n), color = "white", angle = 90) + 
        facet_grid(1 ~ cond_group, scales = "free", space = "free") +
        ggtitle("number of subjects in each comparison") +
        #scale_fill_viridis_c()+
        scale_fill_gradient(low = "white", high = "black") +
        theme_bw() +
        ylab("") +
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
              strip.background = element_blank(), strip.text = element_blank())
print(p2)
dev.off()

R ggplot2 tidyverse Biobase BiocGenerics From line 1 of Figure_2/figure_2_n_subjects.R

library(tidyverse)
library(cowplot)
#setwd("/hpcdata/sg/sg_data/PROJECTS/Monogenic_Project/")

if(!exists("snakemake")){
  setwd("../../..")
  source("scripts/util/paper/parse_snakemake.R")
  parse_snakemake(rule = "figure_2_mod_feature_level_heatmaps")
}

# Set paths
RESULTS.IN.PATHS = list(
  somalogic.modules = snakemake@input[["soma_mod_de"]],# 'Data/Somalogic/analysis_output/differential_expression/somalogic_modules_DE_results.rds',
  somalogic.features = snakemake@input[["soma_feat_de"]],# 'Data/Somalogic/analysis_output/differential_expression/somalogic_features_DE_results.rds',
  microarray.modules = snakemake@input[["array_mod_de"]],# 'Data/Microarray/analysis_output/differential_expression/microarray_modules_DE_results.rds',
  microarray.features = snakemake@input[["array_feat_de"]],# 'Data/Microarray/analysis_output/differential_expression/microarray_features_DE_results.rds',
  tbnks = snakemake@input[["tbnk_de"]]# 'Data/TBNK/analysis_output/differential_expression/tbnks_features_DE_results.rds'
)

source("scripts/util/paper/abbrev_cond.R")


PROTEIN.INFO.IN.PATH = snakemake@input[["protein_info"]]

PROTEIN.MODULES.IN.PATH = snakemake@input[["soma_mods"]]#'Data/Somalogic/analysis_output/wgcna_results/modules.rds'
GENE.MODULES.IN.PATH = snakemake@input[["array_mods"]]#'Data/Microarray/analysis_output/WGCNA/modules.rds'
#PROTEIN.MODULES.IN.PATH = 'Data/Somalogic/analysis_output/wgcna_results/modules.rds'
#GENE.MODULES.IN.PATH = 'Data/Microarray/analysis_output/WGCNA/modules.rds'
COND.GROUPS.IN.PATH <- snakemake@input[["cond_groups"]]#"Reference/condition_groups.rds"

GENE.SETS.IN.PATH = snakemake@input[["genesets"]]#'Gene_sets/processed/combined_gene_sets.RDS'

PURPLE.MOD.OUT.PATH = snakemake@output[["purple_pm2"]]#"Paper_1_Figures/Figure_2/purple_protein_module_bubble_plot.pdf"
RED.TOP.ENRICH.OUT.PATH <- snakemake@output[["red_top_enrich"]]#"Paper_1_Figures/Figure_2/red_module_subcluster_enrich_top10.rds"
RED.MOD.CLUSTER.OUT.DIR <- snakemake@output[["red_subclus_dir"]]#"scratch/red_module_sub_clusters"
RED.DENDRO.OUT.PATH <- snakemake@output[["red_dendro"]]
RED.BUBBLE.OUT.PATH <- snakemake@output[["red_tm1"]]#"Paper_1_Figures/Figure_2/red_module_heatmap.pdf"

#
# Load data
#esets = lapply(ESETS.IN.PATHS, readRDS) # Load expression set data
results = lapply(RESULTS.IN.PATHS, readRDS) # Load results from limma DE testing

protein_info = read.csv(PROTEIN.INFO.IN.PATH, sep = "\t", stringsAsFactors = FALSE)

condition_groups <- readRDS(COND.GROUPS.IN.PATH)

gene.sets <- readRDS(GENE.SETS.IN.PATH)
#read in modules
gene_modules <- readRDS(GENE.MODULES.IN.PATH)
protein_modules <- readRDS(PROTEIN.MODULES.IN.PATH)

## Get the names of the versus.healthy/versus.all options for the heatmaps
first.levels = names(results$somalogic.modules)
## Get the names of the statistics associated with the DE testing e.g. (t-stat, p value, etc.)
second.levels = names(results$somalogic.modules$versus.healthy)

## For both of the versus_healthy/versus_all options
results$all = lapply(first.levels, function(l1) {

  ## And for each statistic type associated with the DE testing results
  stats = lapply(second.levels, function(l2) {

    ## Get the matrix corresponding to the given statistic for the protein modules
    a1 = results$somalogic.modules[[l1]][[l2]]
    ## Name the protein modules with a 'protein.' prefix to distinguish them from the gene modules
    rownames(a1) = paste0('protein.', rownames(a1))
    ## Get the matrix corresponding to the given statistic for the gene modules
    a2 = results$microarray.modules[[l1]][[l2]]
    ## Name the gene modules with a 'gene.' prefix to distinguish them from the protein modules
    rownames(a2) = paste0('gene.', rownames(a2))
    ## Get the matrix corresponding to the given statistic for the tbnks
    a3 = results$tbnks[[l1]][[l2]]
    ## Name the tbnks with a 'tbnks.' prefix for consistency with the modules
    rownames(a3) = paste0('tbnks.', rownames(a3))

    ## Ensure the columns are in the same order
    a2 = a2[, colnames(a1)]
    a3 = a3[, colnames(a1)]

    ## Wrap these statistics together into a single matrix
    rbind(a1, a2, a3)
  })

  ## Name the statistics matrices with the corresponding statistic names (e.g. t-statistic, cohen's d, etc.)
  names(stats) = second.levels
  return(stats)
})

## Name the module-tbnk-combined results based on whether the versus.healthy option or versus.all option was used 
names(results$all) = first.levels

## Add the patient-condition relationships associated with all the esets in order to provide the number of samples for each condition
#sample.datas$all = unique(Reduce(rbind, sample.datas))

# Get the condition counts associated with each data type
#condition_counts = lapply(sample.datas, function(sample.data) {
#  counts = table(sample.data$condition)
#  counts = counts[names(counts) != "Healthy"]
#})



# make dataframe for first bubble_plot
results_t_dat <- results$all$versus.healthy$t %>%
        as.data.frame() %>%
        rownames_to_column(var = "feature") %>%
        gather(key = "condition", value = "t_stat", -feature)

results_q_dat <- results$all$versus.healthy$adj.P.Val %>%
        as.data.frame() %>%
        rownames_to_column(var = "feature") %>%
        gather(key = "condition", value = "q_val", -feature)


joined_dat <- full_join(results_t_dat, results_q_dat) %>%
        mutate(q_val_filtered = replace(q_val, q_val > .2, NA)) %>%
        mutate(feat_category = sapply(strsplit(feature, "\\."), `[[`, 1))

hc_row <-results$all$versus.healthy$t %>% 
        dist() %>% hclust()
joined_dat$feature <- factor(joined_dat$feature, levels = hc_row$labels[hc_row$order])

hc_col <-results$all$versus.healthy$t %>% 
        t() %>%
        dist() %>% hclust()
joined_dat$condition <- factor(joined_dat$condition, levels = hc_col$labels[hc_col$order])


#plotting everything all together
feat_types <- c("gene", "protein", "tbnks")
names(feat_types) <- feat_types
hc_row_list <- lapply(feat_types, function(feat_category){
  mat <- results$all$versus.healthy$t 
  mat[startsWith(rownames(mat), feat_category), ] %>% 
          dist() %>% hclust()

})

order_all_feat_within_datatype <- sapply(hc_row_list, function(hc){
  row_order <- hc$labels[hc$order]
}) %>% unlist()

joined_dat <- joined_dat %>%
        mutate(feature2 = factor(feature, levels = order_all_feat_within_datatype))

#Plot the red module


prep_for_plotting <- function(res_list, keep_features = NULL, condition_groups){

  t_mat <- res_list$t
  coeff_mat <- res_list$effect.size
  q_mat <- res_list$adj.P.Val

  if(!is.null(keep_features)){
    coeff_mat <- coeff_mat[rownames(coeff_mat) %in% keep_features, ]
    t_mat <- t_mat[rownames(t_mat) %in% keep_features, ]
    q_mat <- q_mat[rownames(q_mat) %in% keep_features, ]
  }

  coeff_dat <- coeff_mat %>%
          as.data.frame() %>%
          rownames_to_column(var = "feature") %>%
          gather(key = "condition", value = "coeff", -feature)

  q_dat <- q_mat %>%
          as.data.frame() %>%
          rownames_to_column(var = "feature") %>%
          gather(key = "condition", value = "q_val", -feature)


  #joined <- full_join(t_dat, q_dat) #%>%
  #return(joined)
  joined <- full_join(coeff_dat, q_dat) %>%
          mutate(q_val_filtered = replace(q_val, q_val > .05, NA)) %>%
          left_join(condition_groups)


  hc_row <- t_mat %>% 
          dist() %>% hclust()
  joined$feature_ordered <- factor(joined$feature, levels = hc_row$labels[hc_row$order])

  hc_col <- t_mat %>% 
          t() %>%
          dist() %>% hclust()

  joined$condition_ordered <- factor(joined$condition, levels = hc_col$labels[hc_col$order])

  joined

}

red_genes <- names(gene_modules)[gene_modules == "red"]
red_dat <- prep_for_plotting(results$microarray.features$versus.healthy, 
                             keep_features = red_genes, condition_groups = condition_groups)

red_t_mat <- results$microarray.features$versus.healthy$t
red_t_mat <- red_t_mat[rownames(red_t_mat) %in% red_genes, ]

hc_red <- red_t_mat %>% 
        dist() %>%
        hclust()

pdf(RED.DENDRO.OUT.PATH)
plot(hc_red)
#abline(h = 12)
dev.off()

clusters_red <- cutree(hc_red, k = 3)

clusters_red_list <- lapply(unique(clusters_red), function(i){
  names(clusters_red)[clusters_red == i]                             
})

clusters_red_dat <- clusters_red_list %>% lapply(enframe, name = "name", value = "feature") %>% bind_rows(.id = "cluster")

red_dat <- left_join(red_dat, clusters_red_dat)

source("scripts/util/Enrichment/hyperGeo.R")

# Get set of all genes
universe = rownames(results$microarray.features$versus.healthy$t)


enrichments = lapply(clusters_red_list, function(hits) {
  multiHyperGeoTests(gene.sets, universe, hits, minGeneSetSize = 5, pAdjustMethod = "BH")
})


dir.create(RED.MOD.CLUSTER.OUT.DIR)
dir.create(file.path(RED.MOD.CLUSTER.OUT.DIR, "red_module_sub_clusters"))
for(i in seq_along(clusters_red_list)){
  path <- paste0(RED.MOD.CLUSTER.OUT.DIR, "/red_module_sub_clusters/genes_cluster_",i,".txt")
  writeLines(clusters_red_list[[i]], path)
  path2 <- paste0(RED.MOD.CLUSTER.OUT.DIR, "/red_module_sub_clusters/enrichments_cluster_",i,".csv")
  enrichments[[i]] %>% rownames_to_column(var = "geneset") %>%
  write_csv(path2)
}

top10_list <- lapply(enrichments, function(dat){
  dat %>% 
          rownames_to_column(var = "geneset") %>%
          arrange(across.Adjusted.Pvalue) %>% 
          head(10)
})
saveRDS(top10_list, RED.TOP.ENRICH.OUT.PATH)

show_features <- list(c1 = c(
                             "IFI16", "IFI27L2", "IFI35", "IFI44",
                             "IFI44L", "IFI6", "IFIH1", "IFIT1",  "IFIT2",
                             "IFIT3", "IFIT5", "IFITM1", "IFITM3", "IRF1", 
                             "IRF7", "IRF9", "ISG15"),
                      c2 = c("STAT1", "STAT2", "JAK2", "TRIM14", 
                             "TRIM5", "RIPK2", "POLB")
)

show_features <- unlist(show_features, use.names = FALSE)


ylabs <- replace(levels(red_dat$feature_ordered),
  !levels(red_dat$feature_ordered) %in% show_features,
  ""
)
names(ylabs) <- levels(red_dat$feature_ordered)

levels(red_dat$condition_ordered) <- abbrev_cond(levels(red_dat$condition_ordered))

pdf(RED.BUBBLE.OUT.PATH, height = 4, width =4)
p <- ggplot(red_dat %>% filter(!condition %in% c("CD14")), aes(x = condition_ordered, y = feature_ordered)) +
         geom_tile(aes(fill = coeff)) +
         #scale_color_viridis_c() +
         scale_fill_gradient2(low = "blue", high = "red") +
         facet_grid(cluster~cond_group, scales = "free", space = "free") +
         theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
               #strip.background.y = element_blank(), strip.text.y = element_blank(),
               axis.text.y = element_blank(), axis.ticks.y = element_blank())
print(p)

dev.off()



protein_info_sub <- protein_info %>%
        mutate(feature = make.names(Target)) %>%
        select(feature, Target)
tmp <- results$somalogic.features$versus.healthy

tmp <- lapply(tmp, function(x){
  rn <- protein_info_sub$Target[match(rownames(x), protein_info_sub$feature)]
  rownames(x) <- rn
  x
})

purple_genes <- names(protein_modules)[protein_modules == "purple"]
purple_genes <- protein_info_sub$Target[match(purple_genes, protein_info_sub$feature)]

purple_dat <- prep_for_plotting(tmp,
                                keep_features = purple_genes, 
                                condition_groups = condition_groups)

levels(purple_dat$feature_ordered)[levels(purple_dat$feature_ordered) == "MIG"] <-
        "CXCL9/MIG"

levels(purple_dat$condition_ordered) <- abbrev_cond(levels(purple_dat$condition_ordered))

pdf(PURPLE.MOD.OUT.PATH, height = 6, width = 6)
p <- ggplot(purple_dat %>% filter(!condition %in% c("TERT.TERC", "AI", "PID")), aes(x = condition_ordered, y = feature_ordered)) +
        #geom_point(aes(size = -log10(q_val), color = coeff)) +
        geom_point(aes(size = -log10(q_val), fill = coeff, color = q_val < .2), shape = 21) +
        #scale_color_gradient2(low = "blue", high = "red") +
        scale_fill_gradient2(low = "blue", high = "red") +
        scale_color_manual(values = c("white", "black")) +
        theme_bw() +
        facet_grid(1~cond_group, scales = "free", space = "free") +
        ylab("") +
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
              strip.background = element_blank(), strip.text = element_blank())
print(p)

dev.off()

R Snakemake ggplot2 tidyverse cowplot limma From line 3 of Figure_2/figure_2_version_2.R

library(tidyverse)
library(Biobase)
library(ggpubr)

#-------
#setwd("../../..")
#PROTEIN_IN_PATH <- "Pipeline_out/Data/Somalogic/data_analysis_ready/analysis_ready_subject_level_training_somalogic.rds"
#SOMALOGIC_DE_RESULTS_IN_PATH <- 'Pipeline_out/Data/Somalogic/analysis_output/differential_expression/somalogic_features_DE_results.rds'
#FIG.OUT.PATH <- "Pipeline_out/Paper_1_Figures/Figure_2/IL23.pdf"
#-------

PROTEIN_IN_PATH <- snakemake@input[["soma_feat_data"]]#"Data/Somalogic/data_analysis_ready/analysis_ready_subject_level_training_somalogic.rds"

SOMALOGIC_DE_RESULTS_IN_PATH <- snakemake@input[["soma_feat_de"]]#'Data/Somalogic/analysis_output/differential_expression/somalogic_features_DE_results.rds'

FIG.OUT.PATH <- snakemake@output[[1]]#"Paper_1_Figures/Figure_2/IL23.pdf"

source("scripts/util/paper/abbrev_cond.R")

protein_results <- readRDS(SOMALOGIC_DE_RESULTS_IN_PATH)

protein_eset <- readRDS(PROTEIN_IN_PATH)

soma_q_dat <- protein_results$versus.healthy$adj.P.Val %>%
        as.data.frame() %>%
        rownames_to_column(var = "feature") %>%
        gather(key = "condition", value = "q_val", -feature)

asterisk_vec <- function(p_vec){
  out_vec <- rep("", length(p_vec))
  out_vec <- replace(out_vec, p_vec < .05, "*")
  out_vec <- replace(out_vec, p_vec < .01, "**")
  out_vec <- replace(out_vec, p_vec < .001, "***")
  out_vec
}

IL23_signif <- soma_q_dat %>%
        filter(feature =="IL.23") %>%
        mutate(IL23_signif= asterisk_vec(q_val))


protein_meta <- protein_eset %>%
        pData() %>%
        select(patient_id, gender, Age, condition) %>%
        mutate(IFN.g.protein = exprs(protein_eset)["IFN.g", ]) %>%
        mutate(IL23 = exprs(protein_eset)["IL.23", ])

protein_meta <- protein_meta %>%
        left_join(IL23_signif %>% select(condition, IL23_signif))

plot_dat <- protein_meta

IL23_medians <- plot_dat %>% group_by(condition) %>%
        summarise(med_IL23 = median(IL23)) %>%
        deframe()

plot_dat <- plot_dat %>%
        mutate(cond_group = group_cond(condition))

plot_dat <- plot_dat %>%
        mutate(condition = factor(condition, levels = names(sort(IL23_medians))))


levels(plot_dat$condition) <- abbrev_cond(levels(plot_dat$condition))

pdf(FIG.OUT.PATH, height = 3.5, width = 3.5)
p <- ggplot(plot_dat, aes(x = condition, y = IL23, fill = cond_group)) +
        geom_boxplot(outlier.shape = NA) +
        #geom_jitter(height = 0) +
        ggbeeswarm::geom_beeswarm() +
        geom_text(aes(x = condition, y = max(IL23) *1.1, label = IL23_signif), size = 6, color = "red", position = position_nudge(x = -.5)) +
        ylim(min(plot_dat$IL23), max(plot_dat$IL23) * 1.12) +
        facet_grid(condition == "Healthy" ~ 1, scales = "free_y", space = "free_y") +
        coord_flip() +
        theme_bw() +
        theme(legend.position = "none", 
              strip.background = element_blank(),
              strip.text.x = element_blank(), 
              strip.text.y = element_blank(),
              panel.spacing = unit(0, "lines"))
print(p)
dev.off()

R ggplot2 tidyverse ggpubr Biobase From line 1 of Figure_2/IL23_box_and_corr_w_ifn_gamma.R

library(tidyverse)
library(Biobase)
library(ggpubr)

#setwd("/hpcdata/sg/sg_data/PROJECTS/Monogenic_Project")

TBNK_IN_PATH <- snakemake@input[["tbnk_data"]]#"Data/TBNK/data_analysis_ready/tbnk_training_sample_level_eset.rds"

PROTEIN_IN_PATH <- snakemake@input[["soma_feat_data"]]#"Data/Somalogic/data_analysis_ready/analysis_ready_subject_level_training_somalogic.rds"

FIG.OUT.PATH <- snakemake@output[[1]]#"Paper_1_Figures/Figure_2/IL23_tbnk_cor_select_features.pdf"

tbnk_eset <- readRDS(TBNK_IN_PATH)
protein_eset <- readRDS(PROTEIN_IN_PATH)

tbnk_meta <- tbnk_eset %>%
        pData() %>%
        select(patient_id, gender, condition)

keep_tbnk_features <- grep("abs", featureNames(tbnk_eset), value = TRUE)
tbnk_meta <- bind_cols(tbnk_meta, as.data.frame(t(exprs(tbnk_eset)[keep_tbnk_features, ])))

protein_meta <- protein_eset %>%
        pData() %>%
        select(patient_id, gender, Age, condition) %>%
        mutate(IFN.g.protein = exprs(protein_eset)["IFN.g", ]) %>%
        mutate(IL23 = exprs(protein_eset)["IL.23", ])

protein_meta %>%
        filter(condition == "DADA2") %>% 
        arrange(IL23)


plot_dat <- left_join(protein_meta, tbnk_meta)

plot_dat_long <- plot_dat %>%
        gather(key = "feature", value = "value", -c("patient_id", "gender", "Age", "condition", "IL23"))

keep_features <- c("platelet_abs", "neutrophil_abs", "cd19_abs", "IFN.g.protein")

#df$genes <- factor(df$genes, levels = c("BA","MLL","pos","neg","PMLalpha+"),
# ordered = TRUE, labels=c("BA","MLL","pos","neg",expression(paste("PML", alpha,"+"))))
labs <-  c(expression(paste("PLT")), 
           expression(paste("Neutrophil (#)")), 
           expression(paste("CD19+ B Cells (#)")), 
           expression(paste("IFN-", gamma, " Protein")))
#labs <- c(lev[1:3], )
plot_dat_long_sub <- plot_dat_long %>% 
        filter(condition == "DADA2", feature %in% keep_features)%>%
        mutate(feature = factor(feature, levels = keep_features, labels = labs, ordered = TRUE))

pdf(FIG.OUT.PATH, height = 4, width = 4)
p2 <- ggplot(plot_dat_long_sub, aes(x = IL23, y = value)) +
        geom_point() +
        geom_smooth(method = "lm", se = FALSE) +
        stat_cor(size = 3) +
        facet_wrap(~ feature, scales = "free", nrow = 2, labeller = label_parsed) +
        theme_bw() +
        ggtitle("DADA2 only")
print(p2)
dev.off()

R ggplot2 tidyverse ggpubr Biobase From line 1 of Figure_2/IL23_corr_w_tbnk.R

library(tidyverse)
library(Biobase)
library(ggpubr)


TBNK_IN_PATH <- snakemake@input[["tbnk_data"]]#"Data/TBNK/data_analysis_ready/tbnk_training_sample_level_eset.rds"
PROTEIN_IN_PATH <- snakemake@input[["soma_mod_scores"]]#"Data/Somalogic/analysis_output/wgcna_results/scores_sample_level.rds"
GENE_IN_PATH <- snakemake@input[["array_mod_scores"]]#"Data/Microarray/analysis_output/WGCNA/array_sample_scores.rds"


RESULTS.IN.PATHS = list(
  somalogic.modules = snakemake@input[["soma_mod_de"]],# 'Data/Somalogic/analysis_output/differential_expression/somalogic_modules_DE_results.rds',
  somalogic.features = snakemake@input[["soma_feat_de"]],# 'Data/Somalogic/analysis_output/differential_expression/somalogic_features_DE_results.rds',
  microarray.modules = snakemake@input[["array_mod_de"]],# 'Data/Microarray/analysis_output/differential_expression/microarray_modules_DE_results.rds',
  microarray.features = snakemake@input[["array_feat_de"]],# 'Data/Microarray/analysis_output/differential_expression/microarray_features_DE_results.rds',
  tbnks = snakemake@input[["tbnk_de"]]# 'Data/TBNK/analysis_output/differential_expression/tbnks_features_DE_results.rds'
)

FIG.OUT.PATH <- snakemake@output[[1]]#"Paper_1_Figures/Figure_2/boxplots_of_select_features_v2.pdf"
#setwd("../../..")
#TBNK_IN_PATH <- "Pipeline_out/Data/TBNK/data_analysis_ready/tbnk_training_sample_level_eset.rds"
#PROTEIN_IN_PATH <- "Pipeline_out/Data/Somalogic/analysis_output/wgcna_results/scores_sample_level.rds"
#GENE_IN_PATH <- "Pipeline_out/Data/Microarray/analysis_output/WGCNA/array_sample_scores.rds"
#RESULTS.IN.PATHS = list(
#  somalogic.modules = 'Pipeline_out/Data/Somalogic/analysis_output/differential_expression/somalogic_modules_DE_results.rds',
#  somalogic.features = 'Pipeline_out/Data/Somalogic/analysis_output/differential_expression/somalogic_features_DE_results.rds',
#  microarray.modules = 'Pipeline_out/Data/Microarray/analysis_output/differential_expression/microarray_modules_DE_results.rds',
#  microarray.features = 'Pipeline_out/Data/Microarray/analysis_output/differential_expression/microarray_features_DE_results.rds',
#  tbnks = 'Pipeline_out/Data/TBNK/analysis_output/differential_expression/tbnks_features_DE_results.rds'
#)
#FIG.OUT.PATH <- "Pipeline_out/Paper_1_Figures/Figure_2/boxplots_of_select_features_v2.pdf"

source("scripts/util/paper/abbrev_cond.R")

tbnk_eset <- readRDS(TBNK_IN_PATH)
protein_eset <- readRDS(PROTEIN_IN_PATH)
gene_eset <- readRDS(GENE_IN_PATH)

# Load data
#esets = lapply(ESETS.IN.PATHS, readRDS) # Load expression set data
results = lapply(RESULTS.IN.PATHS, readRDS) # Load results from limma DE testing



# Add a data type for the combination of protein modules, gene modules, and tbnks

## Get the names of the versus.healthy/versus.all options for the heatmaps
first.levels = names(results$somalogic.modules)
## Get the names of the statistics associated with the DE testing e.g. (t-stat, p value, etc.)
second.levels = names(results$somalogic.modules$versus.healthy)

## For both of the versus_healthy/versus_all options
results$all = lapply(first.levels, function(l1) {

  ## And for each statistic type associated with the DE testing results
  stats = lapply(second.levels, function(l2) {

    ## Get the matrix corresponding to the given statistic for the protein modules
    a1 = results$somalogic.modules[[l1]][[l2]]
    ## Name the protein modules with a 'protein.' prefix to distinguish them from the gene modules
    rownames(a1) = paste0('protein.', rownames(a1))
    ## Get the matrix corresponding to the given statistic for the gene modules
    a2 = results$microarray.modules[[l1]][[l2]]
    ## Name the gene modules with a 'gene.' prefix to distinguish them from the protein modules
    rownames(a2) = paste0('gene.', rownames(a2))
    ## Get the matrix corresponding to the given statistic for the tbnks
    a3 = results$tbnks[[l1]][[l2]]
    ## Name the tbnks with a 'tbnks.' prefix for consistency with the modules
    rownames(a3) = paste0('tbnks.', rownames(a3))

    ## Ensure the columns are in the same order
    a2 = a2[, colnames(a1)]
    a3 = a3[, colnames(a1)]

    ## Wrap these statistics together into a single matrix
    rbind(a1, a2, a3)
  })

  ## Name the statistics matrices with the corresponding statistic names (e.g. t-statistic, cohen's d, etc.)
  names(stats) = second.levels
  return(stats)
})

## Name the module-tbnk-combined results based on whether the versus.healthy option or versus.all option was used 
names(results$all) = first.levels

## Add the patient-condition relationships associated with all the esets in order to provide the number of samples for each condition
#sample.datas$all = unique(Reduce(rbind, sample.datas))

# Get the condition counts associated with each data type
#condition_counts = lapply(sample.datas, function(sample.data) {
#  counts = table(sample.data$condition)
#  counts = counts[names(counts) != "Healthy"]
#})



results_q_dat <- results$all$versus.healthy$adj.P.Val %>%
        as.data.frame() %>%
        rownames_to_column(var = "feature") %>%
        gather(key = "condition", value = "q_val", -feature)


asterisk_vec <- function(p_vec){
  out_vec <- rep("", length(p_vec))
  out_vec <- replace(out_vec, p_vec < .05, "*")
  out_vec <- replace(out_vec, p_vec < .01, "**")
  out_vec <- replace(out_vec, p_vec < .001, "***")
  out_vec
}

nk_cell_signif <- results_q_dat %>%
        filter(feature =="tbnks.nk_cells_abs") %>%
        mutate(nk_cell_signif = asterisk_vec(q_val))

rdw_signif <- results_q_dat %>%
        filter(feature =="tbnks.rdw") %>%
        mutate(rdw_signif = asterisk_vec(q_val))

protein_magenta_signif <- results_q_dat %>%
        filter(feature =="protein.magenta") %>%
        mutate(protein_magenta_signif = asterisk_vec(q_val))

protein_purple_signif <- results_q_dat %>%
        filter(feature =="protein.purple") %>%
        mutate(protein_purple_signif = asterisk_vec(q_val))

gene_yellow_signif <- results_q_dat %>%
        filter(feature =="gene.yellow") %>%
        mutate(gene_yellow_signif = asterisk_vec(q_val))

gene_magenta_signif <- results_q_dat %>%
        filter(feature =="gene.magenta") %>%
        mutate(gene_magenta_signif = asterisk_vec(q_val))

signif_dat_list <- list(nk_cell_signif, rdw_signif, protein_magenta_signif, gene_yellow_signif, protein_purple_signif, gene_magenta_signif)
signif_dat_list <- lapply(signif_dat_list, function(dat){
  dat[, c("condition", grep("signif", colnames(dat), value = TRUE))]
})

signif_dat <- Reduce(full_join, signif_dat_list)


tbnk_meta <- tbnk_eset %>%
        pData() %>%
        select(patient_id, gender, condition) %>%
        mutate(nk_cells_abs = exprs(tbnk_eset)["nk_cells_abs", ]) %>%
        mutate(rdw = exprs(tbnk_eset)["rdw", ]) %>%
        left_join(signif_dat) %>%
        mutate(cond_group = group_cond(condition)) %>%
        mutate(condition = factor(condition))

protein_meta <- protein_eset %>%
        pData() %>%
        select(patient_id, gender, condition) %>%
        mutate(PM6 = exprs(protein_eset)["magenta", ]) %>%
        mutate(PM2 = exprs(protein_eset)["purple", ]) %>%
        left_join(signif_dat) %>%
        mutate(cond_group = group_cond(condition)) %>%
        mutate(condition = factor(condition))

gene_meta <- gene_eset %>%
        pData() %>%
        select(patient_id, gender, condition) %>%
        mutate(TM2= exprs(gene_eset)["yellow", ]) %>%
        mutate(TM6 = exprs(gene_eset)["magenta", ]) %>%
        left_join(signif_dat) %>%
        mutate(cond_group = group_cond(condition)) %>%
        mutate(condition = factor(condition))

levels(gene_meta$condition) <- abbrev_cond(levels(gene_meta$condition))
levels(protein_meta$condition) <- abbrev_cond(levels(protein_meta$condition))
levels(tbnk_meta$condition) <- abbrev_cond(levels(tbnk_meta$condition))


p1 <- ggplot(tbnk_meta, aes(y = nk_cells_abs, 
                            x = reorder(condition, nk_cells_abs, median))) +
        geom_boxplot(outlier.shape = NA, aes(fill = cond_group)) +
        ggbeeswarm::geom_beeswarm(alpha = .5, size = 1) +
        geom_text(aes(x = condition, y = max(nk_cells_abs) * 1.1, label = nk_cell_signif), size = 6, color = "red", position = position_nudge(x = -.5)) +
        ylim(min(tbnk_meta$nk_cells_abs),max(tbnk_meta$nk_cells_abs) * 1.2) +
        theme_bw() +
        ggtitle("NK cells (#)") +
        ylab("") + 
        coord_flip()+
        xlab("") + 
        facet_grid(condition == "Healthy" ~ 1, scales = "free_y", space = "free_y") +
        theme(legend.position = "none", 
              strip.background = element_blank(),
              strip.text.x = element_blank(), 
              strip.text.y = element_blank(),
              panel.spacing = unit(0, "lines"))

p2 <- ggplot(tbnk_meta, aes(y = rdw, x = reorder(condition, rdw , median)))+
                            #color = condition == "Healthy")) +
        #geom_boxplot(outlier.shape = NA) +
        geom_boxplot(outlier.shape = NA, aes(fill = cond_group)) +
        geom_text(aes(x = condition, y = max(rdw) * 1.1, label = rdw_signif), size = 6, color = "red", position = position_nudge(x = -.5)) +
        ylim(min(tbnk_meta$rdw),max(tbnk_meta$rdw) * 1.13) +
        #geom_jitter(width = 0, alpha = .5) +
        ggbeeswarm::geom_beeswarm(alpha = .5, size = 1) +
        theme_bw() +
        coord_flip()+
        ggtitle("RDW") +
        ylab("") + 
        xlab("") + 
        facet_grid(condition == "Healthy" ~ 1, scales = "free_y", space = "free_y") +
        theme(legend.position = "none", 
              strip.background = element_blank(),
              strip.text.x = element_blank(), 
              strip.text.y = element_blank(),
              panel.spacing = unit(0, "lines"))

p3 <- ggplot(gene_meta, aes(y = TM6, x = reorder(condition, TM6, median)))+
                            #color = condition == "Healthy")) +
        #geom_boxplot(outlier.shape = NA) +
        geom_boxplot(outlier.shape = NA, aes(fill = cond_group)) +
        geom_text(aes(x = condition, y = max(TM6) * 1.1, label = gene_magenta_signif), size = 6, color = "red", position = position_nudge(x = -.5)) +
        ylim(min(gene_meta$TM6),max(gene_meta$TM6) * 1.2) +
        #geom_jitter(width = 0, alpha = .5) +
        ggbeeswarm::geom_beeswarm(alpha = .5, size = 1) +
        xlab("TM6: CD8/NK cells") +
        coord_flip()+
        theme_bw() +
        facet_grid(condition == "Healthy" ~ 1, scales = "free_y", space = "free_y") +
        ggtitle("TM6: CD8/NK cells") +
        ylab("") + 
        xlab("") + 
        #scale_color_manual(values = c("black", "red")) + 
        theme(legend.position = "none", 
              strip.background = element_blank(),
              strip.text.x = element_blank(), 
              strip.text.y = element_blank(),
              panel.spacing = unit(0, "lines"))

p4 <- ggplot(gene_meta, aes(y = TM2, x = reorder(condition, TM2, median)))+
                            #color = condition == "Healthy")) +
        #geom_boxplot(outlier.shape = NA) +
        geom_boxplot(outlier.shape = NA, aes(fill = cond_group)) +
        geom_text(aes(x = condition, y = max(TM2) * 1.1, label = gene_yellow_signif), size = 6, color = "red", position = position_nudge(x = -.5)) +
        ylim(min(gene_meta$TM2),max(gene_meta$TM2) * 1.2) +
        #geom_jitter(width = 0, alpha = .5) +
        ggbeeswarm::geom_beeswarm(alpha = .5, size = 1) +
        xlab("TM2: heme/RBC score") +
        theme_bw() +
        coord_flip()+
        facet_grid(condition == "Healthy" ~ 1, scales = "free_y", space = "free_y") +
        ggtitle("TM2: heme/RBC") +
        xlab("") + 
        ylab("") + 
        #scale_color_manual(values = c("black", "red")) + 
        theme(legend.position = "none", 
              strip.background = element_blank(),
              strip.text.x = element_blank(), 
              strip.text.y = element_blank(),
              panel.spacing = unit(0, "lines"))

p5 <- ggplot(protein_meta, aes(y = PM6, x = reorder(condition, PM6, median))) +
                               #color = condition == "Healthy")) +
        #geom_boxplot(outlier.shape = NA) +
        geom_boxplot(outlier.shape = NA, aes(fill = cond_group)) +
        geom_text(aes(x = condition, y = max(PM6) * 1.1, label = protein_magenta_signif), size = 6, color = "red", position = position_nudge(x = -.5)) +
        ylim(min(protein_meta$PM6),max(protein_meta$PM6) * 1.22) +
        #geom_jitter(width = 0, alpha = .5) +
        ggbeeswarm::geom_beeswarm(alpha = .5, size = 1) +
        xlab("PM6: platelets score") +
        ggtitle("PM6: platelets") + 
        facet_grid(condition == "Healthy" ~ 1, scales = "free_y", space = "free_y") +
        coord_flip()+
        theme_bw() +
        ylab("") + 
        xlab("") + 
        #scale_color_manual(values = c("black", "red")) + 
        theme(legend.position = "none", 
              strip.background = element_blank(),
              strip.text.x = element_blank(), 
              strip.text.y = element_blank(),
              panel.spacing = unit(0, "lines"))

p6 <- ggplot(protein_meta, aes(y = PM2, x = reorder(condition, PM2, median))) +
                               #color = condition == "Healthy")) +
        #geom_boxplot(outlier.shape = NA) +
        geom_boxplot(outlier.shape = NA, aes(fill = cond_group)) +
        geom_text(aes(x = condition, y = max(PM2) * 1.1, label = protein_purple_signif), size = 6, color = "red", position = position_nudge(x = -.5)) +
        ylim(min(protein_meta$PM2),max(protein_meta$PM2) * 1.2) +
        #geom_jitter(width = 0, alpha = .5) +
        ggbeeswarm::geom_beeswarm(alpha = .5, size = 1) +
        xlab("PM2") +
        ggtitle("PM2") + 
        facet_grid(condition == "Healthy" ~ 1, scales = "free_y", space = "free_y") +
        coord_flip()+
        theme_bw() +
        ylab("") + 
        xlab("") + 
        #scale_color_manual(values = c("black", "red")) + 
        theme(legend.position = "none", 
              strip.background = element_blank(),
              strip.text.x = element_blank(), 
              strip.text.y = element_blank(),
              panel.spacing = unit(0, "lines"))


#file.remove(FIG.OUT.PATH)
#pdf(FIG.OUT.PATH, height = 9.6, width = 5.5)
#print(p1)
#print(p2)
#print(p3)
#print(p4)
#print(p5)
#print(p6)
#dev.off()
p <- cowplot::plot_grid(p1, p2, p3, p4, p5, p6, nrow = 3)
ggsave(plot =p, filename = FIG.OUT.PATH, height = 9.6, width = 5.7)

R ggplot2 tidyverse ggpubr Biobase limma From line 1 of Figure_2/module_and_tbnk_boxplots_highlight.R

library(limma)
library(dplyr)

GENE.IN.PATH <- snakemake@input[["tm1"]]
PROTEIN.IN.PATH <- snakemake@input[["prot"]]

GENE.DAT.OUT.PATH <- snakemake@output[["tm1"]]
PROTEIN.DAT.OUT.PATH <- snakemake@output[["prot"]]

gene <- readRDS(GENE.IN.PATH)

protein <- readRDS(PROTEIN.IN.PATH)

cmat <- makeContrasts(
                      groupXCGD-groupSTAT1.GOF, 
                      group47CGD-groupSTAT1.GOF, 
                      (groupXCGD + group47CGD)/2 -groupSTAT1.GOF, 
                      levels = colnames(gene)

)

gene_cfit <- contrasts.fit(fit = gene, contrasts = cmat)
gene_cfit <- eBayes(gene_cfit)

protein_cfit = contrasts.fit(fit = protein, contrasts = cmat)
protein_cfit <- eBayes(protein_cfit)

topTable(gene_cfit, coef = 1)

xcgd_gene_dat <- topTable(gene_cfit, coef = 1, number = Inf)
forty7cgd_gene_dat <- topTable(gene_cfit, coef = 2, number = Inf)

xcgd_prot_dat <- topTable(protein_cfit, coef = 1, number = Inf)
forty7cgd_prot_dat <- topTable(protein_cfit, coef = 2, number = Inf)


#I.TAC and IFIT1, STAT1
#grep("TAC", prot_dat$Target, ignore.case = T, value = T)
#
#grep("ifi", prot_dat$Target, ignore.case = T, value = T)
#grep("ifn", prot_dat$Target, ignore.case = T, value = T)
#grep("p56", prot_dat$Target, ignore.case = T, value = T)
#grep("inter", prot_dat$Target, ignore.case = T, value = T)
#grep("isg", prot_dat$Target, ignore.case = T, value = T)
#grep("56", prot_dat$Target, ignore.case = T, value = T)
#
#grep("STAT1", prot_dat$Target, ignore.case = T, value = T)


gene_dat <- 
        list(`X-CGD - STAT1 GOF` = xcgd_gene_dat, 
             `p47-CGD - STAT1 GOF` = forty7cgd_gene_dat) %>%
        bind_rows(.id = "comparison") %>%
        filter(module_name == "red") %>%
        mutate(module_name = "TM1: Interferon")


prot_dat <- 
        list(`X-CGD - STAT1 GOF` = xcgd_prot_dat, 
             `p47-CGD - STAT1 GOF` = forty7cgd_prot_dat) %>%
        bind_rows(.id = "comparison") %>%
        filter(Target %in% c("I-TAC", "STAT1")) %>%
        select(-c(Units, Dilution, Type, Organism))

readr::write_csv(prot_dat, PROTEIN.DAT.OUT.PATH)
readr::write_csv(gene_dat, GENE.DAT.OUT.PATH)

R dplyr From line 1 of Figure_2_Statistics/cgd_stat1_ifn.R

knitr::opts_chunk$set(echo = TRUE)
#knitr::opts_knit$set(root.dir = normalizePath("../../../"))

R Markdown From line 8 of Figure_2_Statistics/supplemental_figure_2_other_classifiers_table.Rmd

# Load libraries
library(ggplot2)
library(gridExtra)
library(ggrepel)
library(ggpubr)
library(dplyr)
library(tidyr)
library(reshape2)

if(!exists("snakemake")){
  setwd("../../..")
  source("scripts/util/paper/parse_snakemake.R")
  parse_snakemake(rule = "figure_2_statistics_other_classifiers")
}

#setwd("/hpcdata/sg/sg_data/PROJECTS/Monogenic_Project")

# Source utilities
source('scripts/util/Plotting/plot_auc.R')
source('scripts/util/paper/abbrev_cond.R')

RF.META.IN.PATHS = list(
  CGD = snakemake@input[["cgd_meta"]],#'Classification/cgd_random_forest_sample_meta_data_all.RDS',
  STAT1.GOF = snakemake@input[["stat1_meta"]],#'Classification/stat1_random_forest_sample_meta_data_all.RDS',
  FMF = snakemake@input[["fmf_meta"]],#'Classification/fmf_random_forest_sample_meta_data_all.RDS',
  Job = snakemake@input[["job_meta"]]#'Classification/job_random_forest_sample_meta_data_all.RDS'
)


## The LOO CV results for each patient for each condition-based random forest classifier
HI.CONDITION.IN.PATHS = list(
  CGD = snakemake@input[["cgd_res"]],#'Classification/results/cgd_rf_results_all.RDS',
  STAT1.GOF = snakemake@input[["stat1_res"]],#'Classification/results/stat1_rf_results_all.RDS',
  FMF = snakemake@input[["fmf_res"]],#'Classification/results/fmf_rf_results_all.RDS',
  Job = snakemake@input[["job_res"]]#'Classification/results/job_rf_results_all.RDS'
)

# Supplemental Figure 4f -- condition-specific classifiers
results = lapply(HI.CONDITION.IN.PATHS, readRDS) ## Extract the condition-specific classifier results 


metas = lapply(RF.META.IN.PATHS, readRDS) ## Extact the meta data assocaited with each condition-specific classifier

## List the condition groups for each classifier
condition.groups = list(CGD = c('XCGD', '47CGD'),
                        STAT1.GOF = 'STAT1 GOF',
                        FMF = 'FMF',
                        Job = 'Job')

## Create a name conversion map to make the data types underlying each classifier more clear
conversion = c("microarray.modules" = 'Gene modules', 
               "tbnks" = 'CBCs + Lymphocyte Phenotyping',
               "cbcs" = 'CBCs',
               "somalogic.modules" = 'Protein modules', 
               "all.modules.with.tbnks" = 'Modules + CBCs', 
               "all.modules.plus.grey.with.tbnks" = 'Modules + CBCs + Grey Proteins')

## Insantiate a function to get the AUC associated with each classifier and each condition
get_aucs = function(result, meta, condition.group) {
  ## Get the condition associated with each patient
  conditions = meta[rownames(result), 'condition']
  apply(result, 2, function(x) {
    ## Get the ROC curve associated with each classifier
    roc = get_roc(x = x, y = conditions, pos = condition.group)
    ## Get the AUC of that ROC curve
    get_auc(roc)
  })
}

## Run the function on each of the condition-specific classifier results (and simplify into a matrix)
aucs = mapply(get_aucs, results, metas, condition.groups, SIMPLIFY = T)

## Create a data frame holding the AUCs for each classifier, and melt it
df = as.data.frame(aucs) %>% 
  tibble::rownames_to_column(var = 'classifier') %>%
  mutate(classifier = conversion[classifier]) %>%
  mutate(classifier = factor(classifier, levels = conversion)) %>%
  melt()

df <- df %>%
        filter(grepl("Grey", classifier))

R Markdown Snakemake ggplot2 dplyr tidyr ggrepel ggpubr reshape2 gridExtra From line 13 of Figure_2_Statistics/supplemental_figure_2_other_classifiers_table.Rmd

print("AUCs")
print(df)

R Markdown From line 99 of Figure_2_Statistics/supplemental_figure_2_other_classifiers_table.Rmd

table(metas$CGD$condition %in% c("XCGD", "47CGD"))

R Markdown From line 109 of Figure_2_Statistics/supplemental_figure_2_other_classifiers_table.Rmd

table(metas$STAT1.GOF$condition == "STAT1 GOF")

R Markdown From line 115 of Figure_2_Statistics/supplemental_figure_2_other_classifiers_table.Rmd

table(metas$FMF$condition == "FMF")

R Markdown From line 120 of Figure_2_Statistics/supplemental_figure_2_other_classifiers_table.Rmd

table(metas$Job$condition == "Job")

R Markdown From line 125 of Figure_2_Statistics/supplemental_figure_2_other_classifiers_table.Rmd

library(reshape2)
library(dplyr)
library(tibble)
library(Biobase)

# Set paths


#setwd("/hpcdata/sg/sg_data/PROJECTS/Monogenic_Project/")
SOMALOGIC_MODULE_IN_PATH <- snakemake@input[["soma_mods"]]#"Data/Somalogic/analysis_output/wgcna_results/modules.rds"
MICROARRAY_MODULE_IN_PATH <- snakemake@input[["array_mods"]]#"Data/Microarray/analysis_output/WGCNA/modules.rds"

soma_mods <- readRDS(SOMALOGIC_MODULE_IN_PATH)
array_mods <- readRDS(MICROARRAY_MODULE_IN_PATH)

source("scripts/util/Plotting/tbnk_featurename_replace.R")

soma_mods <- soma_mods %>% enframe(value = "module_color", name = "feature") %>%
        mutate(module_name = replace_mod_names_single_type(module_color, sheet = "PM")) %>%
        select(feature, module_name, module_color )

array_mods <- array_mods %>% enframe(value = "module_color", name = "feature") %>%
        mutate(module_name = replace_mod_names_single_type(module_color, sheet = "TM")) %>%
        select(feature, module_name, module_color)

## Limma DE statistics
RESULTS.IN.PATHS = list(
  `protein modules` = snakemake@input[["soma_mods_res"]],#'Data/Somalogic/analysis_output/differential_expression/somalogic_modules_DE_results.rds',
  `protein features` = snakemake@input[["soma_feat_res"]],#'Data/Somalogic/analysis_output/differential_expression/somalogic_features_DE_results.rds',
  `gene modules` = snakemake@input[["array_mods_res"]],#'Data/Microarray/analysis_output/differential_expression/microarray_modules_DE_results.rds',
  `gene features` = snakemake@input[["array_feat_res"]],#'Data/Microarray/analysis_output/differential_expression/microarray_features_DE_results.rds',
  `cbcs and tbnks` = snakemake@input[["tbnk_res"]]#'Data/TBNK/analysis_output/differential_expression/tbnks_features_DE_results.rds'
)

## Somalogic eset
PROTEIN.ESET.IN.PATH = snakemake@input[["soma_data"]]#'Data/Somalogic/data_analysis_ready/analysis_ready_sample_level_training_somalogic.rds'

## Tables out paths
TABLE.OUT.PATHS = list(
  `protein modules` = snakemake@output[["soma_mods"]],#'Paper_1_Figures/Figure_2_Tables/protein_modules_DE_results.txt',
  `protein features` = snakemake@output[["soma_feat"]],#'Paper_1_Figures/Figure_2_Tables/protein_features_DE_results.txt',
  `gene modules` = snakemake@output[["array_mods"]],#'Paper_1_Figures/Figure_2_Tables/gene_modules_DE_results.txt',
  `gene features` = snakemake@output[["array_feat"]],#'Paper_1_Figures/Figure_2_Tables/gene_features_DE_results.txt',
  `cbcs and tbnks` = snakemake@output[["tbnk"]]#'Paper_1_Figures/Figure_2_Tables/cbc_and_tbnks_DE_results.txt'
)

# Load data
results = lapply(RESULTS.IN.PATHS, readRDS)
somalogic = readRDS(PROTEIN.ESET.IN.PATH)

# For each data type
results.long = lapply(names(results), function(data.type) {

  # Get the data for that data type
  result = results[[data.type]]

  # Decide whether the data type is a module or feature
  if(grepl('module', data.type)) {
    feature.header = 'module'
  } else {
    feature.header = 'feature'
  }

  # For each statistic
  stats = lapply(names(result$versus.healthy), function(stat) {

    # Convert statistics from wide to long
    df = melt(result$versus.healthy[[stat]])

    # Rename the columns
    colnames(df) = c(feature.header, 'condition', stat)

    # Add the data tpye
    df$data.type = data.type

    # Select the desired columns
    df = df %>% select(!!feature.header, condition, data.type, !!stat)
    return(df)
  })

  # Name the long form statistics after the data tpyes
  names(stats) = names(result$versus.healthy)

  # Combine the data frames to put each statistic into one data frame
  result.long = stats[[1]]
  for(stat in stats[2:length(stats)]) {
    result.long = result.long %>% right_join(stat, by = c(feature.header,'condition','data.type'))
  }

  return(result.long)
})

names(results.long) = names(results)

# Add the feature meta data from the proteins to the proteomic data frame as the feature names are not standardized like genes
protein.features = results.long$`protein features`
f = fData(somalogic)
#protein.features = cbind(protein.features, f)
f = f %>% mutate(feature = make.names(Target))
protein.features = left_join(protein.features, f)
results.long$`protein features` = protein.features

results.long$`protein features` <- right_join(soma_mods, results.long$`protein features` )
results.long$`gene features` <- right_join(array_mods, results.long$`gene features` )

results.long$`protein modules` <- results.long$`protein modules` %>%
        rename(module_color = module) %>%
        mutate(module_name = replace_mod_names_single_type(as.character(module_color), sheet = "PM")) %>%
        select(module_name, module_color, everything())

results.long$`gene modules` <- results.long$`gene modules` %>%
        rename(module_color = module) %>%
        mutate(module_name = replace_mod_names_single_type(as.character(module_color), sheet = "TM")) %>%
        select(module_name, module_color, everything())

results.long <- lapply(results.long, function(dat){
  dat %>% filter(!condition %in% c("AI", "PID", "TERT.TERC"))
})

# Output tables to text files
for(data.type in names(results.long)) {
  result.long = results.long[[data.type]]
  file.path = TABLE.OUT.PATHS[[data.type]]
  write.table(result.long, file = file.path, sep = '\t', row.names = F, col.names = T)
}

R dplyr reshape2 tibble Biobase From line 2 of Figure_2_Tables/figure_2_tables_version_2.R

library(tidyverse)
library(cowplot)
library(ggraph)
library(tidygraph)
library(ggpubr)
library(ggrepel)

#setwd("/hpcdata/sg/sg_data/PROJECTS/Monogenic_Project")

JIVE.PC.IN.PATH <- snakemake@input[["jive_pcs"]]#"Integration_output/jive/subject/prcomp_list.rds"
JIVE.IN.PATH <- snakemake@input[["jive"]]#"Integration_output/jive/subject/jive.rds"
COND.GROUPS.IN.PATH <- snakemake@input[["cond_groups"]]#"Reference/condition_groups.rds"

JPC.FIG.OUT.PATH <- snakemake@output[["jpc_scatter_box"]]#"Paper_1_Figures/Figure_3/jive_scatter_plus_boxplots_colorschemes.pdf"
MAD.PLOT.OUT.PATH <- snakemake@output[["mad_plot"]]#"Paper_1_Figures/Figure_3/mad_plot.pdf"

source("scripts/util/paper/abbrev_cond.R")

prcomp.list <- readRDS(JIVE.PC.IN.PATH)
jive <- readRDS(JIVE.IN.PATH)
pdat <- jive$pdat
joint <- prcomp.list$joint$x

disease_cat <- readRDS(COND.GROUPS.IN.PATH)

stopifnot(identical(rownames(joint), pdat$patient_id))

joint <- joint %>% 
  as.data.frame() %>%
  bind_cols(pdat) %>% 
  mutate(cond.abbrev = abbrev_cond(condition)) %>%
  mutate(cond.grouped = group_cond(condition))


#Inspired by this post
#https://stackoverflow.com/questions/23463324/r-add-centroids-to-scatter-plot

joint_centroids <- joint %>% group_by(condition) %>%
        summarise(mean_pc1 = mean(PC1), mean_pc2 = mean(PC2),
                  med_pc1 = median(PC1), med_pc2 = median(PC2),
                  mad_pc1 = mad(PC1), mad_pc2 = mad(PC2),
                  sd_pc1 = sd(PC1), sd_pc2 = sd(PC2),
                  sem_pc1 = sd(PC1) / sqrt(n()), sem_pc2 = sd(PC2) / sqrt(n()),
                  n = n())

joint <- left_join(joint, joint_centroids)

joint <- left_join(joint, disease_cat)
joint <- joint %>%
        mutate(cond_group = as.character(cond_group)) %>%
        mutate(cond_group = factor(cond_group, 
                                   levels = c("Healthy", "AI", "TERT.TERC", "PID"))) %>%
        mutate(cond_group = replace(cond_group, condition == "Healthy", "Healthy"))

joint_sub <- joint %>% select(mean_pc1, mean_pc2, cond.abbrev, sd_pc1, sd_pc2, cond_group, n, mad_pc1, mad_pc2) %>% distinct()

joint_sub <- joint_sub %>%
        filter(n > 3)

pca.plot.points.connected.ellipse <- 
  ggplot(data = joint, aes(color = cond_group, fill= cond_group)) +
  geom_text(aes(x = PC1, y = PC2, label = cond.abbrev), size = 2.5) +
  geom_point(data = joint_sub, aes(x=mean_pc1, y=mean_pc2),size=5)+
  geom_segment(aes(x=mean_pc1, y=mean_pc2, xend=PC1, yend=PC2), alpha = .2) +
  geom_text_repel(data = joint_sub, aes(x=mean_pc1, y=mean_pc2, label = cond.abbrev),size=5)+
  theme_bw() +
  xlim(c(-47, 40)) +
  ylim(-25, 50) +
  theme(legend.position = "none") +
  theme(axis.title.y = element_blank(), axis.title.x = element_blank(), axis.text.x = element_blank()) +
  theme(plot.margin = unit(c(0, 0, 0, 0), "cm"))

pc.medians <-
  joint %>%
  group_by(cond.abbrev) %>%
  summarise(pc1.median = median(PC1), pc2.median = median(PC2))




#pc1.order <- pc.medians$cond.abbrev[order(-pc.medians$pc1.median)]
pc1.order <- pc.medians$cond.abbrev[order(pc.medians$pc1.median)]
pc1.order <- c("Healthy", pc1.order[pc1.order != "Healthy"])
joint$cond.abbrev <- factor(joint$cond.abbrev, levels = pc1.order)

#keep same color scheme as other plots
#pc1.box <- 
#  ggplot(joint, aes(x = cond.abbrev, y = PC1, color = cond.grouped)) +
#  geom_boxplot(outlier.shape = NA) +
#  geom_jitter() + 
#  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
#  theme_bw() +
#  coord_flip() + 
#  theme(axis.title.x.bottom = element_blank(), 
#        axis.title.y = element_blank(), 
#        legend.position = "none")

pc1.box <- 
  ggplot(joint, aes(x = cond.abbrev, y = PC1)) +
  geom_boxplot(outlier.shape = NA, aes(fill = cond_group)) +
  geom_jitter() + 
  theme_bw() +
  coord_flip() + 
  ylim(c(-47, 40)) +
  theme(legend.position = "none") +
  stat_compare_means(ref.group = "Healthy", hide.ns = TRUE, label = "p.signif", color = "red", label.y = -45) +
  geom_vline(xintercept = 1.5) +
  xlab("Condition")

#pc1.box

pc2.order <- pc.medians$cond.abbrev[order(pc.medians$pc2.median)]
pc2.order <- c("Healthy", pc2.order[pc2.order != "Healthy"])
joint$cond.abbrev <- factor(joint$cond.abbrev, levels = pc2.order)

#pc2.box <- 
#  ggplot(joint, aes(x = cond.abbrev, y = PC2, color = cond.grouped)) +
#  geom_boxplot(outlier.shape = NA) +
#  geom_jitter() + 
#  theme_bw() +
#  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
#  #scale_x_discrete(position = "top") +
#  theme(axis.title.y.left = element_blank(), 
#        axis.title.x = element_blank(), legend.position = "none")

pc2.box <- 
  ggplot(joint, aes(x = cond.abbrev, y = PC2)) +
  geom_boxplot(outlier.shape = NA, aes(fill = cond_group)) +
  geom_jitter() + 
  ylim(-25, 50) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = .4)) +
  #coord_flip() + 
  theme(legend.position = "none", axis.title.y = element_blank(), axis.text.y = element_blank()) +
  stat_compare_means(ref.group = "Healthy", hide.ns = TRUE, label = "p.signif", color = "red", label.y = -23) +
  geom_vline(xintercept = 1.5) +
  xlab("Condition")

#pdf("Paper_1_Figures/Figure_3/jive_boxplots.pdf")
#print(pc1.box)
#print(pc2.box)
#dev.off()

p1 <- cowplot::plot_grid(pca.plot.points.connected.ellipse, pc2.box, pc1.box, 
                         align = "hv", rel_heights = c(1, .7), rel_widths = c(1, .4))

pdf(JPC.FIG.OUT.PATH, height = 10, width = 16)
print(p1)
dev.off()

mad_plot <- 
  ggplot(data = joint_sub, aes(color = cond_group)) +
  geom_abline(slope = 1, intercept = 0) + 
  geom_point(data = joint_sub, aes(x=mad_pc1, y=mad_pc2),size=5)+
  geom_text_repel(data = joint_sub, aes(x=mad_pc1, y=mad_pc2, label = cond.abbrev),size=5)+
  theme_bw() +
  xlab("MAD jPC1") +
  ylab("MAD jPC2") +
  xlim(c(0, 20)) + 
  ylim(c(0, 20)) + 
  scale_color_manual(values = scales::hue_pal()(4)[c(1,2,4)]) +
  geom_abline(slope = 1, intercept = 0) + 
  theme(legend.position = "none")
  #theme(axis.title.y = element_blank(), axis.title.x = element_blank(), axis.text.x = element_blank()) +

ggsave(plot = mad_plot, filename = MAD.PLOT.OUT.PATH, height = 4.5, width = 4.5)

R ggplot2 tidyverse cowplot ggrepel ggpubr ggraph tidygraph From line 1 of Figure_3/3a_PCA_with_boxplots.R

library(tidyverse)
library(r.jive)


#setwd("/hpcdata/sg/sg_data/PROJECTS/Monogenic_Project")

JIVE.PC.PATH <- snakemake@input[["jive_pcs"]]#"Integration_output/jive/subject/prcomp_list.rds"
JIVE.PATH <- snakemake@input[["jive"]]#"Integration_output/jive/subject/jive.rds"

FIG.OUT.PATH <- snakemake@output[[1]]#"Paper_1_Figures/Figure_3/jive_var_exp.pdf"

prcomp.list <- readRDS(JIVE.PC.PATH)
jive <- readRDS(JIVE.PATH)

#std dev of each pc
prcomp.list$joint$sdev

#The variance explained plot as done by the package
#showVarExplained(jive)
#Calculating the variance explained
#see r.jive::showVarExplained
result <- jive
l <- length(result$data)
VarJoint = rep(0, l)
for (i in 1:l) VarJoint[i] = norm(result$joint[[i]], type = "F")^2/norm(result$data[[i]], 
    type = "F")^2
VarIndiv = rep(0, l)
for (i in 1:l) VarIndiv[i] = norm(result$individual[[i]],
    type = "F")^2/norm(result$data[[i]], type = "F")^2
VarResid = 1 - VarJoint - VarIndiv


#Put variance explained into data frame


dat <- data.frame(Joint = VarJoint, Individual = VarIndiv, Residual = VarResid,
                  data.type = c("WB Transcriptome", "Serum Proteins")) %>% 
  gather(key = component, value = var.explained, -data.type)

dat$label <- vector("character", nrow(dat))

#make label that contains both joint, individual residual and microarray/somalogic
for(i in seq_len(nrow(dat))){
  if(dat$component[[i]] != "Joint"){
    print(i)
    dat$label[[i]] <- paste(dat$data.type[[i]], dat$component[[i]])
  }else{
    dat$label[[i]] <- as.character(dat$component)[[i]]
  }
}

dat$label <- factor(dat$label, 
                    levels = c( "Serum Proteins Residual", "Serum Proteins Individual", "WB Transcriptome Residual", "WB Transcriptome Individual", "Joint"))

#The plot that goes into the figure
pdf(FIG.OUT.PATH, height =1.5, width = 6)
ggplot(dat, aes(x = data.type, y = var.explained, fill = label)) +
  geom_col() +
  theme_classic() +
  coord_flip() +
  scale_fill_manual(values = c("WB Transcriptome Residual" = "lightblue2", 
                               "WB Transcriptome Individual" = "skyblue3",
                               "Serum Proteins Residual" = "coral1",
                               "Serum Proteins Individual" = "coral3",
                               "Joint" = "lightsteelblue4")) +
ylab("Variance\nExplained")# +
#theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

dev.off()

R ggplot2 tidyverse r.jive From line 1 of Figure_3/3b_jive_var_expl.R

library(ggplot2)

FIG.OUT.PATH <- snakemake@output[[1]]

x <- rnorm(500, 0, 3)
y <- x + rnorm(500, 0, 1)

mat <- cbind(x,y)
pca <- prcomp(mat)
pcs <- pca$x

s <- svd(mat)
U <- s$u
V <- s$v
D <- diag(s$d) ##turn it into a matrix

DV <- sqrt(D) %*% t(V)

dat = data.frame(x=x, y = y)

pdf(FIG.OUT.PATH, height = 4, width =4)
ggplot(dat, aes(x = x, y = y)) + 
  geom_point(color = "steelblue") + 
  theme_classic() + 
  theme(axis.ticks = element_blank(), axis.text = element_blank()) +
  geom_segment(aes(x=0, xend=DV[1,1], y=0, yend=DV[1,2]), size = 2, color = "darkblue",
               arrow = arrow(length = unit(0.5, "cm"))) + 
  geom_segment(aes(x=0, xend=DV[2,1], y=0, yend=DV[2,2]), size = 2, color = "blue",
               arrow = arrow(length = unit(0.5, "cm")))

ggplot(dat, aes(x = x, y = y)) + 
  geom_point(color = "red") + 
  theme_classic() + 
  theme(axis.ticks = element_blank(), axis.text = element_blank()) +
  geom_segment(aes(x=0, xend=DV[1,1], y=0, yend=DV[1,2]), size = 2, color = "darkred",
               arrow = arrow(length = unit(0.5, "cm"))) + 
  geom_segment(aes(x=0, xend=DV[2,1], y=0, yend=DV[2,2]), size = 2, color = "salmon",
               arrow = arrow(length = unit(0.5, "cm")))
dev.off()

R ggplot2 Salmon From line 1 of Figure_3/figure3_schematic.R

library(tidyverse)
library(cowplot)
library(ggpubr)

#setwd("/hpcdata/sg/sg_data/PROJECTS/Monogenic_Project")
ALL.SUBJECT.PATH <- snakemake@input[["all_subj"]]#"Integration_output/jive/subject/prcomp_list.rds"
HEALTHY.ONLY.PATH <- snakemake@input[["healthy_only"]]#"Integration_output/jive/subject_onlyHealthy/prcomp_list.rds"
NO.HEALTHY.PATH <- snakemake@input[["no_healthy"]]#"Integration_output/jive/subject_noHealthy/prcomp_list.rds"

PLOT_OUT_PATH <- snakemake@output[[1]]#"Paper_1_Figures/Figure_3/jive_pc_comparison.pdf"

path.list <- list(`All Subjects` = ALL.SUBJECT.PATH, 
                  `Only Healthy` = HEALTHY.ONLY.PATH, 
                  `No Healthy` = NO.HEALTHY.PATH)

prcompLL <- lapply(path.list, readRDS)

joint.list <- lapply(prcompLL, function(prcompL){
  pca <- prcompL[["joint"]]

  pca[["x"]]
})

names(joint.list) <- gsub("No Healthy", "Excluding Healthy", names(joint.list))
names(joint.list) <- gsub("All Subjects", "Including All Subjects", names(joint.list))

#This function takes of the jive categories and plots the desired pc
# Scatterplot with correlation p values

single_plot <- function(jive.cat1, jive.cat2, PC){
  x <- joint.list[[jive.cat1]][, PC]
  y <- joint.list[[jive.cat2]][, PC]

  intersecting.pats <- intersect(names(x), names(y))
  x <- x[intersecting.pats]
  y <- y[intersecting.pats]

  #flip direction if anticorrelated- directions of PC's are arbitrary
  if(cor(x, y) <= 0){
    y <- -y
  }

  dat <-data.frame(x =x, y = y)
  p <- ggplot(dat, aes(x =x, y=y))+
    geom_point()+
    xlab(jive.cat1) +
    ylab(jive.cat2) + 
    stat_smooth(method = "lm", se = FALSE) +
    ggtitle(PC) + 
    theme_bw() +
    stat_cor()
  return(p)
}

#Creat PC1-3 plot objects to be printed later

pc1.plotlist <- list(single_plot("Including All Subjects", "Excluding Healthy", "PC1"),
                     single_plot("Including All Subjects", "Only Healthy", "PC1"))




#final plot used in figure

p <- plot_grid(plotlist = pc1.plotlist, ncol = 2)
ggsave(plot = p, PLOT_OUT_PATH, height = 4)

R ggplot2 tidyverse cowplot ggpubr From line 1 of Figure_3/jive_pc_comparison.R

library(tidyverse)
library(gridExtra)

#If there are issues with this script it is probably because I changed something with the featurename replacment


JIVE.PC.PATH <- snakemake@input[["jive_pcs"]]
TBNK.PATH <- snakemake@input[["tbnk"]]
SOMA.PATH <- snakemake@input[["soma_mod_scores"]]
ARRAY.PATH <- snakemake@input[["array_mod_scores"]]
#setwd("../../..")
#JIVE.PC.PATH <- "Pipeline_out/Integration_output/jive/subject/prcomp_list.rds"
#TBNK.PATH <- "Pipeline_out/Data/TBNK/data_analysis_ready/tbnk_training_subject_level_eset.rds"
#SOMA.PATH <- "Pipeline_out/Data/Somalogic/analysis_output/wgcna_results/scores_subject_level.rds"
#ARRAY.PATH <- "Pipeline_out/Data/Microarray/analysis_output/WGCNA/array_subject_scores.rds"


FIG.OUT.PATH <- snakemake@output[["figure"]]
TAB.OUT.PATH <- snakemake@output[["table"]]
#FIG.OUT.PATH <- "Pipeline_out/Paper_1_Figures/Figure_3/jive_pc_cor.pdf"

prcomp.list <- readRDS(JIVE.PC.PATH)
tbnk.eset <- readRDS(TBNK.PATH)
soma.modules <- readRDS(SOMA.PATH)
array.modules <- readRDS(ARRAY.PATH)

source("scripts/util/Plotting/tbnk_featurename_replace.R")
#Get only the first three PC's of the joint. All other PC's essentially have eigen values of 0

joint <- prcomp.list$joint$x[, 1:3]

#Put expressionsets into list and make sure that the patient_id are sampleNames/rownames of the expression matrix
eset.list <- list(protein = soma.modules, gene = array.modules, tbnk = tbnk.eset)
eset.list <- lapply(eset.list, function(eset){
  sampleNames(eset) <- eset[["patient_id"]]
  eset
})

do_cortest <- function(x, y, method){
  intersection <- intersect(names(x), names(y))

  x <- x[match(intersection, names(x))]
  y <- y[match(intersection, names(y))]

  stopifnot(all.equal(names(x), names(y)))

  cor.test(x, y, method = method)
}


get_cor_dat<- function(eset, joint, method){
  intersection <- intersect(rownames(joint), eset$patient_id)
  mat <- exprs(eset)
  mat <- mat[ ,match(intersection, eset$patient_id)]
  mat <- mat[complete.cases(mat),]
  mat <- t(mat)

  joint <- joint[match(intersection, rownames(joint)),]
  stopifnot(all.equal(rownames(mat), rownames(joint)))

  lapply(colnames(joint), function(PC){
    lapply(colnames(mat), function(feature){
      x <- joint[, PC]
      y <- mat[, feature]
      result <- do_cortest(x, y, method = method)
      data.frame(cor = result$estimate, p = result$p.value, PC = PC, feature = feature, stringsAsFactors = FALSE)
    }) %>% bind_rows()
  }) %>% bind_rows()
}

cordat_list <- lapply(eset.list, get_cor_dat, joint = joint, method = "spearman")

cordat_list[[1]] <- cordat_list[[1]] %>% mutate(feature2 = replace_mod_names_single_type(feature, "PM"))
cordat_list[[2]] <- cordat_list[[2]] %>% mutate(feature2 = replace_mod_names_single_type(feature, "TM"))
cordat_list[[3]] <- cordat_list[[3]] %>% mutate(feature2 = feature)

cordat <- bind_rows(cordat_list, .id = "feature_type")

cordat <- cordat %>%
        mutate(feature2 = replace(feature2, is.na(feature2), feature[is.na(feature2)])) %>% 
        mutate(p.adj = p.adjust(p, method = "fdr")) %>%
        mutate(asterisk = ifelse(p.adj < .05, "*", ""))

cordat$feature2 <- factor(cordat$feature2)

levels(cordat$feature2) <- replace_tbnk_names(levels(cordat$feature2))

cordat <- cordat %>% 
        mutate(feature_type2 = replace(feature_type, feature_type == "tbnk",
                                       tbnk_groups(feature2[feature_type == "tbnk"], 
                                                   "new name")))

lev_order <- c("TM", "PM", "Innate", "Lymphocytes", "RBC & PLT")
cordat <- cordat %>%
        mutate(feature_type2 = gsub("protein", "PM", feature_type2)) %>%
        mutate(feature_type2 = gsub("gene", "TM", feature_type2)) %>%
        mutate(feature_type2 = factor(feature_type2, levels = lev_order))

cordat <- cordat %>%
        mutate(PC = paste0("j", PC))


#write the table----
cordat %>% select(-c("asterisk", "feature_type", "feature")) %>%
        rename(feature_type = feature_type2, feature = feature2) %>%
        write_csv(TAB.OUT.PATH)

#plot the figure ---
p <- ggplot(cordat %>% filter(PC %in% c("jPC1", "jPC2")), aes(y = PC, x = feature2)) + geom_tile(aes(fill = cor)) +
  scale_radius(limits = c(0,1)) + 
  scale_fill_gradient2(low = "blue", mid = "white", 
                        high = "red", limits = c(-1, 1)) + 
  geom_text(aes(label = asterisk), color = "black") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
        strip.background.y = element_blank(), strip.text.y = element_blank(),
        axis.line = element_blank(), 
        #strip.background.x = element_rect(colour="black", fill="grey90"),
        panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
  facet_grid(1 ~ feature_type2, scales = "free", space = "free") +
  ylab("") + xlab("")
  #theme(axis.line = element_blank(), 
  #      axis.ticks = element_blank(), 
  #      axis.title = element_blank(),
  #      legend.position = "left") +

pdf(FIG.OUT.PATH, width = 7, height = 3.7)
print(p)
dev.off()

R ggplot2 tidyverse gridExtra From line 1 of Figure_3/joint_tbnk_module_cor.R

library(ggplot2)
library(ggpubr)
library(dplyr)

JIVE.PC.IN.PATH <- snakemake@input[["jive_pcs"]]#"Integration_output/jive/subject/prcomp_list.rds"
JIVE.IN.PATH <- snakemake@input[["jive"]]#"Integration_output/jive/subject/jive.rds"

FIG.OUT.PATH <- snakemake@output[[1]]#"Paper_1_Figures/Figure_3/jpc1_hc_vs_all_try_colorschemes.pdf"
#Wilcoxon of Healthy vs Not Healthy pc1

source("scripts/util/paper/abbrev_cond.R")

prcomp.list <- readRDS(JIVE.PC.IN.PATH)
jive <- readRDS(JIVE.IN.PATH)
pdat <- jive$pdat
joint <- prcomp.list$joint$x

stopifnot(identical(rownames(joint), pdat$patient_id))

joint <- joint %>% 
  as.data.frame() %>%
  bind_cols(pdat) %>% 
  mutate(cond.abbrev = abbrev_cond(condition)) %>%
  mutate(cond.grouped = group_cond(condition))

#joint <- joint %>% mutate(Healthy = condition == "Healthy")
joint <- joint %>% mutate(Healthy = ifelse(condition == "Healthy", "Healthy", "Disease"))
#file.remove("Paper_1_Figures/Figure_3/jpc1_hc_vs_all.pdf")

p <- ggplot(joint, aes(x = Healthy, y = PC1)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter() + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  theme_bw() +
  stat_compare_means() + xlab("") + ylab("jPC1")
ggsave(plot = p, filename = FIG.OUT.PATH, height = 3, width = 3)

R ggplot2 dplyr ggpubr From line 1 of Figure_3/jpc1_healthy_vs_disease.R

library(tidyverse)

if(!exists("snakemake")){
  setwd("../../..")
  source("scripts/util/paper/parse_snakemake.R")
  parse_snakemake(rule = "figure_3_jpc_feature_cor")
}

JIVE.PC.PATH <- snakemake@input[["jive_pca"]]
SOMA.PATH <- snakemake@input[["soma"]]
ARRAY.PATH <- snakemake@input[["array"]]


TAB.OUT.PATH <- snakemake@output[[1]]

prcomp.list <- readRDS(JIVE.PC.PATH)
soma.eset <- readRDS(SOMA.PATH)
array.eset <- readRDS(ARRAY.PATH)

#Get only the first three PC's of the joint. All other PC's essentially have eigen values of 0

joint <- prcomp.list$joint$x[, 1:3]
array_indiv = prcomp.list$array.ind$x[, 1:2]
soma_indiv = prcomp.list$soma.ind$x[, 1:2]

jive_pc_list <- list(joint = joint, transcriptome_individual = array_indiv, proteome_individual = soma_indiv)


#Put expressionsets into list and make sure that the patient_id are sampleNames/rownames of the expression matrix
eset.list <- list(protein = soma.eset, gene = array.eset)
eset.list <- lapply(eset.list, function(eset){
  sampleNames(eset) <- eset[["patient_id"]]
  eset
})

do_cortest <- function(x, y, method){
  intersection <- intersect(names(x), names(y))

  x <- x[match(intersection, names(x))]
  y <- y[match(intersection, names(y))]

  stopifnot(all.equal(names(x), names(y)))

  cor.test(x, y, method = method)
}


get_cor_dat<- function(eset, jive_pc_mat, method){
  intersection <- intersect(rownames(jive_pc_mat), eset$patient_id)
  mat <- exprs(eset)
  mat <- mat[ ,match(intersection, eset$patient_id)]
  mat <- mat[complete.cases(mat),]
  mat <- t(mat)

  jive_pc_mat <- jive_pc_mat[match(intersection, rownames(jive_pc_mat)),]
  stopifnot(all.equal(rownames(mat), rownames(jive_pc_mat)))

  lapply(colnames(jive_pc_mat), function(PC){
    lapply(colnames(mat), function(feature){
      x <- jive_pc_mat[, PC]
      y <- mat[, feature]
      result <- do_cortest(x, y, method = method)
      data.frame(cor = result$estimate, p = result$p.value, PC = PC, feature = feature, stringsAsFactors = FALSE)
    }) %>% bind_rows()
  }) %>% bind_rows()
}

cordat_list <- lapply(jive_pc_list, function(jive_pc_mat){
   lapply(eset.list, get_cor_dat, jive_pc_mat = jive_pc_mat, method = "pearson")
})

cordat <- lapply(cordat_list, function(sublist){
  bind_rows(sublist, .id = "input_data")
}) %>% bind_rows(.id = "jive_pc_type")

cordat <- cordat %>%
        mutate(input_data = gsub("protein", "proteomics", input_data)) %>%
        mutate(input_data = gsub("gene", "transcriptomics", input_data))

cordat <- cordat %>%
        mutate(tmp = paste(jive_pc_type, input_data)) %>%
        filter(!tmp %in% c("transcriptome_individual proteomics", 
                           "proteome_individual transcriptomics")) #%>%
        #select(-tmp)

cordat <- cordat %>% select(-tmp)

cordat$PC[cordat$jive_pc_type == "joint"] <- 
        paste0("j", cordat$PC[cordat$jive_pc_type == "joint"])

cordat$PC[cordat$jive_pc_type == "transcriptome_individual"] <- 
        paste0("transcriptome_i", cordat$PC[cordat$jive_pc_type == "transcriptome_individual"])

cordat$PC[cordat$jive_pc_type == "proteome_individual"] <- 
        paste0("proteome_i", cordat$PC[cordat$jive_pc_type == "proteome_individual"])

write_csv(cordat, TAB.OUT.PATH)

R Snakemake tidyverse From line 1 of Figure_3_Tables/jive_pc_feat_cor.R

library(tidyverse)

#setwd("/hpcdata/sg/sg_data/PROJECTS/Monogenic_Project")

ENRICH.IN.PATH <- snakemake@input[["enrich"]]#"Integration_output/jive/subject/pc_enrich_dat_camera.rds"

TABLE.OUT.PATH <- snakemake@output[[1]]#"Paper_1_Figures/Figure_3_Tables/jive_pc_enrichment.csv"

all.dat <- readRDS(ENRICH.IN.PATH)
all.dat <- all.dat %>% filter(geneset.db != "tiss.general")

all.dat <- all.dat %>%
        filter(PValue < .05) %>%
        mutate(pca.data = replace(pca.data, pca.data == "array.ind", "transcriptome_individual")) %>%
        mutate(pca.data = replace(pca.data, pca.data == "soma.ind", "proteome_individual")) %>%
        filter(PC %in% c("PC1", "PC2"))

all.dat$PC[all.dat$pca.data == "joint"] <- 
        paste0("j", all.dat$PC[all.dat$pca.data == "joint"])

all.dat$PC[all.dat$pca.data == "transcriptome_individual"] <- 
        paste0("transcriptome_i", all.dat$PC[all.dat$pca.data == "transcriptome_individual"])

all.dat$PC[all.dat$pca.data == "proteome_individual"] <- 
        paste0("proteome_i", all.dat$PC[all.dat$pca.data == "proteome_individual"])


all.dat <- all.dat %>%
        mutate(in.data = replace(in.data, in.data == "array", "transcriptome")) %>%
        mutate(in.data = replace(in.data, in.data == "soma", "proteome"))

all.dat <- all.dat %>%
        rename(enrichment.input.data = in.data) %>%
        select(-pca.data) %>%
        select(geneset, everything())

write_csv(all.dat, TABLE.OUT.PATH)

R tidyverse From line 1 of Figure_3_Tables/jpc_enrichments.R

library(tidyverse)

#setwd("/hpcdata/sg/sg_data/PROJECTS/Monogenic_Project")

JIVE.PC.IN.PATH <- snakemake@input[["jive_pca"]]#"Integration_output/jive/subject/prcomp_list.rds"
JIVE.IN.PATH <- snakemake@input[["jive"]]#"Integration_output/jive/subject/jive.rds"

TABLE.OUT.PATH <- snakemake@output[[1]]#"Paper_1_Figures/Figure_3_Tables/jive_pcs.csv"

prcomp.list <- readRDS(JIVE.PC.IN.PATH)
jive <- readRDS(JIVE.IN.PATH)
pdat <- jive$pdat

pdat <- pdat %>% select(patient_id, condition)

joint <- prcomp.list$joint$x[, 1:2]
colnames(joint) <- paste0("j", colnames(joint))

gene <- prcomp.list$array.ind$x[, 1:2]
colnames(gene) <- paste0("transcriptome_i", colnames(gene))

protein <- prcomp.list$soma.ind$x[, 1:2]
colnames(protein) <- paste0("proteome_i", colnames(protein))

pcs <- do.call(cbind, list(joint, gene, protein))

stopifnot(identical(rownames(pcs), pdat$patient_id))

dat <- 
  bind_cols(pdat, as.data.frame(pcs))

write_csv(dat, TABLE.OUT.PATH)

R tidyverse From line 1 of Figure_3_Tables/jpc_scores.R

library(ggplot2)
library(reshape2)
library(cowplot)
library(ggpubr)
library(dplyr)
library(tidyr)
library(ggpubr)

# Set paths
HI.IN.PATH = snakemake@input[[1]]#'Classification/results/healthy_rf_results_all.RDS'
META.IN.PATH = snakemake@input[[2]]#'Classification/random_forest_sample_meta_data.RDS'
PC1.IN.PATH = snakemake@input[[3]]#"Integration_output/jive/subject/prcomp_list.rds"
FEATURE.GVI.PVALS.IN.PATH = snakemake@input[[4]]#"Classification/results/healthy_rf_pvals_all.RDS"
TRANSCRIPTIONAL.SURROGATE.SIGNATURE.ENRICHMENTS.IN.PATH = snakemake@input[[5]]#"Classification/transcriptional_surrogates/surrogate_enrichments.RDS"
DESIGN.MAT.IN.PATH <- snakemake@input[[6]]

FIGURE.4b.OUT.PATH = snakemake@output[[1]]#"Paper_1_Figures/Figure_4/4.b.pdf",
FIGURE.4cde.OUT.PATH = snakemake@output[[2]]#"Paper_1_Figures/Figure_4/4.cde.pdf"
FIGURE.4f.OUT.PATH = snakemake@output[[3]]#"Paper_1_Figures/Figure_4/4.f.pdf"
FIGURE.4g.OUT.PATH = snakemake@output[[4]]#"Paper_1_Figures/Figure_4/4.g.pdf"
FIGURE.4h.OUT.PATH = snakemake@output[[5]]#"Paper_1_Figures/Figure_4/4.h.pdf"

# We source utilities
source('scripts/util/Plotting/enrichments.R')
source('scripts/util/Plotting/plot_auc.R')
source('scripts/util/paper/abbrev_cond.R')
source('scripts/util/Groups/groups.R')

# Figure 4a -- Cartoon displaying classifier schema
## See box > users > Dylan Hirsch > Monogenic Project - Paper 1 > fig4.a.pptx

# Figure 4b -- AUC curve for classifier

## Load the healthy rf prediction results and the sample metadata
results = readRDS(HI.IN.PATH)
meta = readRDS(META.IN.PATH)

## Get the "all-features" classifier predictions, which we have been using for the Healthy Index
result = results$all.modules.plus.grey.with.tbnks

## Create the ROC curve from the HI
roc = get_roc(result, meta$condition, 'Healthy')

## Get the AUC associated with this ROC curve
auc = get_auc(roc)

## Round the AUC to 2 digits
auc = format(auc, digits = 2)

## Plot the ROC curve with the AUC displayed
p = ggplot(roc, aes(x = fpr, y = tpr)) + geom_line(color = 'black', show.legend = FALSE) + 
  geom_abline(slope = 1, linetype = 'dashed', color = 'grey') + 
  theme_bw() + geom_text(aes(x = .75, y = .25), size = 4, label = paste0('AUC: ', auc), show.legend = FALSE) + 
  xlab('False Positive Rate') + ylab('True Positive Rate') + 
  theme(axis.title.x = element_text(size = 8),
        axis.title.y = element_text(size = 8),
        axis.text.x = element_text(size = 8),
        axis.text.y = element_text(size = 8)
  )

## Save plot
ggsave(FIGURE.4b.OUT.PATH, p, device = 'pdf', height = 2, width = 2)

# Figure 4c -- healthy index barplots for each condition, arranged by condition supertype

## Load the healthy index and subject meta-data
results = readRDS(HI.IN.PATH)
meta = readRDS(META.IN.PATH)

## Get the conditions associated with each supertype
AI = util.get_ai()
PID = c(util.get_pid(), 'NEMO') # We put in NEMO manually because it shows up as non-PID in the database
Telo = util.get_tert_terc()

## Create a data frame with the HI and condition for each subject
df = data.frame(healthy.index = results$all.modules.plus.grey.with.tbnks, 
                condition = meta$condition) %>%
  mutate(condition = as.character(condition)) %>%
  mutate(group = condition %>% # Add in condition super-type
           replace(condition %in% AI, 'AI') %>%
           replace(condition %in% PID, 'PID') %>%
           replace(condition %in% Telo, 'Telo')) %>%
  mutate(group = factor(group, levels = c('Healthy','AI','Telo','PID'))) %>%
  mutate(condition = abbrev_cond(condition)) # We use the abbreviated condition names

## Compute the median healthy index for each condition
condition.median.healthy.indexes = df %>%
  group_by(condition) %>%
  summarise(condition.median.healthy.index = median(healthy.index))

## Add in the median healthy index for each condition to the original data frame
df = df %>%
  right_join(condition.median.healthy.indexes, by = 'condition') %>%
  arrange(as.numeric(group), desc(condition.median.healthy.index)) %>% # Sort by condition super-type and then median healthy index
  mutate(condition = factor(condition, condition %>% unique)) %>%
  mutate(condition = relevel(condition, abbrev_cond('Healthy'))) %>% # Make sure Healthy is the first level
  mutate(condition = factor(condition, levels = rev(levels(condition))))

## Plot the box plots
HI_max = max(df$healthy.index) + .01
p1 = ggplot(df, aes(x = condition, y = healthy.index, fill = group)) + 
  geom_boxplot(outlier.colour = NA) + 
  ylim(0, HI_max) +
  theme_bw() + 
  geom_jitter() + 
  coord_flip() + 
  stat_compare_means(ref.group = "Healthy", hide.ns = TRUE, label = "p.signif", color = "red", label.y = 0, size = 10) +
  theme(axis.text.x = element_text(size = 15),
        axis.title.x = element_blank(),
        axis.text.y = element_text(size = 15),
        axis.title.y = element_text(size = 15),
        legend.text = element_text(size = 15),
        legend.title = element_text(size = 15),
        legend.key.size = unit(2,"line"))

# Figure 4d -- density plot for major condition groups

## Load data
results = readRDS(HI.IN.PATH)
meta = readRDS(META.IN.PATH)

## Choose the conditions to plot
conditions = c('Healthy','47CGD','XCGD','Job','STAT1 GOF','FMF')

## Create a data frame with the HI and condition for each subject from the conditions of interest
df = data.frame(healthy.index = results$all.modules.plus.grey.with.tbnks, 
                condition = meta$condition) %>%
  mutate(condition = as.character(condition)) %>%
  filter(condition %in% conditions) %>%
  mutate(condition = abbrev_cond(condition)) %>%
  mutate(condition = factor(condition, levels = abbrev_cond(conditions)))

## Make the density plots
p2 = ggplot(df, aes(x = healthy.index, fill = condition)) + geom_density(alpha = .4) +
  ylab('Density') + xlab('Healthy Index') + xlim(0, HI_max) + 
  theme_bw() + theme(
    axis.text.x = element_text(size = 15),
    axis.title.x = element_blank(),
    axis.text.y = element_text(size = 15),
    axis.title.y = element_text(size = 15),
    legend.text = element_text(size = 15),
    legend.title = element_text(size = 15)
  )

# Figure 4e -- correlation between healthy index and PC1 score in healthy and conditions

## Load Joint PC1
jive = readRDS(PC1.IN.PATH)
PC1 = jive$joint$x[,1]

## Load healthy index
results = readRDS(HI.IN.PATH)
predictions = results$all.modules.plus.grey.with.tbnks
names(predictions) = rownames(results)

## Load patient metadata
meta = readRDS(META.IN.PATH)

## Get the patients that have a healthy index and joint PC1
ids = intersect(names(predictions), names(PC1))
PC1 = PC1[ids]
predictions = predictions[ids]
conditions = meta[ids, 'condition']
group = ifelse(conditions == "Healthy", "Healthy", "Disease")
group = factor(group, levels = c("Healthy", "Disease"))

## Create a data frame joining the two
df = data.frame(predictions, PC1, group)

## Create a scatter plot with regression lines showing the relationships between HI and PC1
p3 = ggplot(df, aes(x = predictions, y = PC1, color = group)) + ylab('PC1 Score') +
  xlab('Healthy Index') +
  geom_point() + geom_smooth(method = 'lm', formula = y ~ x, se = FALSE) +
  ylim(-50, 50) + xlim(0, HI_max) +
  stat_cor(label.y = c(30,45), label.x = c(0,0), show.legend = FALSE, size = 5) +
  theme_bw() + 
  theme(axis.title.x = element_text(size = 15),
        axis.text.x = element_text(size = 15),
        axis.text.y = element_text(size = 15),
        axis.title.y = element_text(size = 15),
        legend.text = element_text(size = 15),
        legend.title = element_text(size = 15))

## Plot Figure 4c-e
p = plot_grid(plotlist = list(p1, p2, p3), nrow = 3, align = 'v', rel_heights = c(10, 3, 6))
ggsave(FIGURE.4cde.OUT.PATH, p, height = 10, width = 10)

# Figure 4f -- Age relationships of top conditions 

## Load data
results = readRDS(HI.IN.PATH)
meta = readRDS(META.IN.PATH)

## Choose the conditions to plot
conditions = c('Healthy','47CGD','XCGD','Job','STAT1 GOF','FMF')

## Make a data frame with the healthy indexes, conditions, and ages of the patients from the conditions of interest

library(tidyverse)
df <- results %>%
        rownames_to_column(var = "patient_id") %>%
        select(patient_id, all.modules.plus.grey.with.tbnks) %>%
        rename(healthy.index = all.modules.plus.grey.with.tbnks) %>%
        left_join(meta) %>%
  filter(condition %in% conditions) %>%
  mutate(condition = as.character(condition)) %>%
  mutate(condition = abbrev_cond(condition)) %>%
  mutate(condition = factor(condition, levels = abbrev_cond(conditions)))

p = ggplot(df, aes(x = Age, y = healthy.index)) + geom_point() + 
  facet_wrap(~condition, ncol = 3, nrow = 2) + ylab('Healthy Index') +
  geom_smooth(method = 'lm', formula = y ~ x, se = FALSE) +
  ylim(0, .6) + xlim(0, 80) +
  stat_cor(label.x = 0, label.y = .55) +
  theme_bw() + 
  theme(axis.title.x = element_text(size = 15),
        axis.text.x = element_text(size = 15),
        axis.text.y = element_text(size = 15),
        axis.title.y = element_text(size = 15),
        strip.text = element_text(size = 15))

ggsave(FIGURE.4f.OUT.PATH, p, height = 5, width = 10)

# Figure 4g -- Bar charts of top features of the classifier

## Load the HI feature pvalues
results = readRDS(FEATURE.GVI.PVALS.IN.PATH)
p.vals = results$all.modules.plus.grey.with.tbnks
meta = readRDS(META.IN.PATH)

design_mat_list <- readRDS(DESIGN.MAT.IN.PATH)
design_mat <- design_mat_list$all.modules.plus.grey.with.tbnks

## Adjust the p-values using an FDR and convert to negative log10 pvalues
p.adjusted = p.adjust(p.vals, 'fdr')
neg.log10.p.adjusted = -1 * log10(p.adjusted)

## Create a data frame with the feature names, data type, and negative log 10 pvalues, just for features passing the FDR cutoff
df = data.frame(label = names(p.vals), p.adjusted = p.adjusted, neg.log10.pvals = neg.log10.p.adjusted) %>%
  filter(p.adjusted < .20) %>%
  select(-p.adjusted)

keep_feat <- df$label %>% as.character()

feat_mat <- design_mat[, keep_feat] 

feat_dat <- feat_mat %>%
        as.data.frame() %>%
        tibble::rownames_to_column(var = "patient_id") %>%
        gather(key = feat, value = value, -patient_id) %>%
        left_join(meta) %>%
        mutate(healthy = condition == "Healthy")

ttestf <- function(x) {
        t.test(value ~ healthy, paired = FALSE, data = x) %>% broom::tidy()
}
t_test_dat <- feat_dat %>%
        group_by(feat) %>%
        do(ttestf(.)) %>% ungroup()

feat_sign <- t_test_dat %>%
        mutate(t_sign = sign(statistic)) %>%
        select(feat, t_sign) %>%
        tibble::deframe()



df <- df %>%
        mutate(t_sign = feat_sign[match(label, names(feat_sign))])

df = df %>%
  mutate(data.type = label %>%
           as.character() %>%
           replace(., grepl('somalogic\\.grey\\.', .), 'Grey\nModule\nProteins') %>%
           replace(., grepl('somalogic\\.modules\\.', .), 'Protein\nModule\nScores') %>%
           replace(., grepl('tbnks\\.', .), 'CBC +\nLymphocyte\nPhenotyping')) %>%
  mutate(label = label %>%
           as.character() %>%
           gsub('somalogic\\.grey\\.', '', .) %>%
           gsub('somalogic\\.modules\\.', '', .) %>%
           gsub('microarray\\.modules\\.', '', .) %>%
           gsub('tbnks\\.', '', .))

## We now manually clean up the feature names one-by-one to make them look better when plotting
df = df %>%
  mutate(label = label %>%
           gsub('nk_cells_percent','NK Cells (%)', .) %>%
           gsub('nk_cells_abs','NK Cells (#)', .) %>%
           gsub('MIP.1a','MIP 1a', .) %>%
           gsub('purple','PM2', .) %>%
           gsub('Cathepsin.H','Cathepsin H', .) %>%
           gsub('IL.18.Ra','IL-18 Ra', .) %>%
           gsub('rdw','RDW', .) %>%
           gsub('LD78.beta','LD78 b', .))

## We order the features by negative log 10 pvalue
df = df %>% 
  arrange(neg.log10.pvals) %>%
  mutate(label = factor(label, levels = label)) %>%
  mutate(data.type = factor(data.type))

## We plot the feature p-values in a bar plot
p = ggplot(df, aes(y = neg.log10.pvals * t_sign, x = label, fill = data.type)) + 
  geom_bar(stat="identity") + theme_bw() + xlab('Parameter') + ylab('Negative log10 q-values') + coord_flip() +
  #scale_fill_manual(values = c('darkblue','steelblue', 'lightblue')) + 
  labs(fill = 'Data Type') +
  geom_hline(aes(yintercept = -log10(.20), linetype = 'FDR = .20'), color = 'gray', size = 1) +
  geom_hline(aes(yintercept = log10(.20), linetype = 'FDR = .20'), color = 'gray', size = 1) +
  scale_linetype_manual(values = 'dashed') +
  scale_fill_manual(values = c("seagreen4", "violetred", "plum1")) +
  facet_grid(t_sign~1, scales = "free_y", space = "free_y") +
  ylab("-log10(q) * direction") +
  theme_bw() +
  theme(axis.title.x = element_text(size = 10),
        axis.text.x = element_text(size = 10),
        axis.title.y = element_text(size = 10),
        axis.text.y = element_text(size = 10),
        legend.text = element_text(size = 10),
        legend.title = element_text(size = 10),
        strip.background = element_blank(),
        strip.text = element_blank()
  )

## We save the plot
ggsave(FIGURE.4g.OUT.PATH, p, height = 3, width = 5)

# Figure 4h -- barplots of enrichments for gene surrogate signatures of proteins
## We load the enrichments on the surrogate signatures
enrichments = readRDS(TRANSCRIPTIONAL.SURROGATE.SIGNATURE.ENRICHMENTS.IN.PATH)

## We use the enrichment bar plot utility function to create a barplot for two proteins of interest: SAA and MIP1a
p1 = make_enrichment_bar_plot(enrichment = enrichments$somalogic.modules.purple$positive) + 
  ggtitle('Proteomic Purple Module') + scale_fill_manual(values = c('tomato2', 'forestgreen', 'darkorchid2')) +
  theme_bw() +
  theme(axis.title.x = element_blank(),
        axis.text.x = element_text(size = 15),
        axis.title.y = element_text(size = 15),
        axis.text.y = element_text(size = 15),
        legend.text = element_text(size = 15),
        legend.title = element_text(size = 15),
        plot.title = element_text(size = 15))

p2 = make_enrichment_bar_plot(enrichment = enrichments$somalogic.grey.SAA$positive) +
  ggtitle('SAA') +
  scale_fill_manual(values = c('tomato2', 'darkorchid2')) +
  theme_bw() +
  theme(axis.title.x = element_text(size = 15),
        axis.text.x = element_text(size = 15),
        axis.title.y = element_text(size = 15),
        axis.text.y = element_text(size = 15),
        legend.text = element_text(size = 15),
        legend.title = element_text(size = 15),
        plot.title = element_text(size = 15))

## We put the plots together
p = plot_grid(plotlist = list(p1, p2), nrow = 2, align = 'v')

## And save them
ggsave(FIGURE.4h.OUT.PATH, p, height = 6, width = 10)

R ggplot2 tidyverse dplyr tidyr cowplot ggpubr reshape2 From line 2 of Figure_4/figure_4_version_6.R

knitr::opts_chunk$set(echo = TRUE)
#knitr::opts_knit$set(root.dir = normalizePath("../../.."))

R Markdown From line 10 of Figure_4_Statistics/figure_4_stats_version_1.Rmd

library(Biobase)
library(dplyr)

R Markdown dplyr Biobase From line 16 of Figure_4_Statistics/figure_4_stats_version_1.Rmd

source('scripts/util/Groups/groups.R')

R Markdown From line 22 of Figure_4_Statistics/figure_4_stats_version_1.Rmd

# Sample meta data for samples run in the random forest
META.IN.PATH = snakemake@input[[1]] #"Classification/healthy_random_forest_sample_meta_data_all.RDS"
# The healthy index from the healthy random forest
INDEX.IN.PATH = snakemake@input[[2]] #"Classification/results/healthy_rf_results_all.RDS"

R Markdown From line 27 of Figure_4_Statistics/figure_4_stats_version_1.Rmd

meta = readRDS(META.IN.PATH)
healthy.index = readRDS(INDEX.IN.PATH)

R Markdown From line 37 of Figure_4_Statistics/figure_4_stats_version_1.Rmd

AI = util.get_ai()
PID = util.get_pid()
Telo = util.get_tert_terc()

R Markdown From line 43 of Figure_4_Statistics/figure_4_stats_version_1.Rmd

df = data.frame(healthy.index = healthy.index$all.modules.plus.grey.with.tbnks, condition = meta$condition, 
                stringsAsFactors = FALSE)
df = df %>%
  filter(condition %in% c(AI, PID, Telo)) %>%
  mutate(group = condition %>%
           replace(.,. %in% AI, 'AI') %>%
           replace(.,. %in% PID, 'PID') %>%
           replace(.,. %in% Telo, 'Telo')) %>%
  mutate(condition = factor(condition, unique(condition))) %>%
  mutate(group = factor(group, c('AI','PID','Telo')))

R Markdown From line 50 of Figure_4_Statistics/figure_4_stats_version_1.Rmd

kruskal.test(df$healthy.index, df$condition)

R Markdown From line 64 of Figure_4_Statistics/figure_4_stats_version_1.Rmd

kruskal.test(df$healthy.index, df$group)

R Markdown From line 69 of Figure_4_Statistics/figure_4_stats_version_1.Rmd

df.subset = df %>% filter(group == 'PID')
kruskal.test(df.subset$healthy.index, df.subset$condition)

R Markdown From line 74 of Figure_4_Statistics/figure_4_stats_version_1.Rmd

df.subset = df %>% filter(group == 'AI')
kruskal.test(df.subset$healthy.index, df.subset$condition)

R Markdown From line 80 of Figure_4_Statistics/figure_4_stats_version_1.Rmd

df.subset = df %>% filter(group == 'Telo')
kruskal.test(df.subset$healthy.index, df.subset$condition)

R Markdown From line 86 of Figure_4_Statistics/figure_4_stats_version_1.Rmd

meta = readRDS(META.IN.PATH)
healthy.index = readRDS(INDEX.IN.PATH)

R Markdown From line 94 of Figure_4_Statistics/figure_4_stats_version_1.Rmd

healthy.ages = meta$Age[meta$condition == "Healthy"]
case.ages = meta$Age[meta$condition != "Healthy"]
ks.test(healthy.ages, case.ages)

R Markdown From line 100 of Figure_4_Statistics/figure_4_stats_version_1.Rmd

ages = c(healthy.ages, case.ages)
groups = factor(c(rep('healthy', length(healthy.ages)), rep('case', length(case.ages))), c('healthy','case'))
kruskal.test(c(healthy.ages, case.ages), groups)

R Markdown From line 107 of Figure_4_Statistics/figure_4_stats_version_1.Rmd

index = readRDS(INDEX.IN.PATH)
meta = readRDS(META.IN.PATH)

R Markdown From line 116 of Figure_4_Statistics/figure_4_stats_version_1.Rmd

index = index$all.modules.plus.grey.with.tbnks
sapply(unique(meta$condition), function(condition) {
  var(index[meta$condition == condition])
})

R Markdown From line 121 of Figure_4_Statistics/figure_4_stats_version_1.Rmd

index = readRDS(INDEX.IN.PATH)
meta = readRDS(META.IN.PATH)

R Markdown From line 131 of Figure_4_Statistics/figure_4_stats_version_1.Rmd

index = index$all.modules.plus.grey.with.tbnks

median.age = median(meta$Age[meta$condition == 'Healthy'])
younger.indexes = index[meta$Age <= median.age & meta$condition == 'Healthy']
older.indexes = index[meta$Age > median.age & meta$condition == 'Healthy']
wilcox.test(younger.indexes, older.indexes)

R Markdown From line 137 of Figure_4_Statistics/figure_4_stats_version_1.Rmd

index = readRDS(INDEX.IN.PATH)
meta = readRDS(META.IN.PATH)

R Markdown From line 149 of Figure_4_Statistics/figure_4_stats_version_1.Rmd

index = healthy.index$all.modules.plus.grey.with.tbnks

R Markdown From line 154 of Figure_4_Statistics/figure_4_stats_version_1.Rmd

conditions = setdiff(unique(meta$condition), 'Healthy')
for(condition in conditions) {
  print(condition)
  print(paste0('n = ',sum(meta$condition == condition)))
  res = wilcox.test(index[meta$condition == 'Healthy'], 
              index[meta$condition == condition], 
              alternative = 'greater')
  print(res)
}

R Markdown From line 159 of Figure_4_Statistics/figure_4_stats_version_1.Rmd

knitr::opts_chunk$set(echo = TRUE)
#knitr::opts_knit$set(root.dir = normalizePath("../../.."))

R Markdown From line 8 of Figure_4_Statistics/n_features_used_classifiers.Rmd

#if(!exists("snakemake")){
#  setwd("../../..")
#}

IN.PATH <- snakemake@input[[1]]

design_mats <- readRDS(IN.PATH)

n_feat <- sapply(design_mats, ncol)

keep_cats <- c("somalogic.modules", "microarray.modules", "tbnk", "all.modules")

n_feat[setdiff(names(n_feat), c("somalogic.features", "microarray.features"))]

R Markdown Snakemake From line 18 of Figure_4_Statistics/n_features_used_classifiers.Rmd

library(dplyr)
library(Biobase)

# Set paths
if(!exists("snakemake")){
  setwd("../../..")
  source("scripts/util/paper/parse_snakemake.R")
  parse_snakemake(rule = "figure_4_tables")

}
GVIS.IN.PATHS = list(
  HEALTHY = snakemake@input[["gvis"]]#'Classification/results/healthy_rf_gvis_all.RDS',
)

PVALS.IN.PATHS = list(
  HEALTHY = snakemake@input[["pvals"]]#'Classification/results/healthy_rf_pvals_all.RDS',
)

SOMALOGIC.IN.PATH = snakemake@input[["soma_data"]]#'Data/Somalogic/data_analysis_ready/analysis_ready_sample_level_training_somalogic.rds'

TABLE.OUT.PATHS = list(
  HEALTHY = snakemake@output[["table"]]#'Paper_1_Figures/Figure_4_Tables/healthy_feature_gvi_table.txt',
)

source("scripts/util/Plotting/tbnk_featurename_replace.R")

# Create maps to rename the classifier groups and data types

classifier.groups = list(
  HEALTHY = 'Healthy versus all conditions'
)

classifier.names = list(
  'all.modules.plus.grey.with.tbnks' = 'CBCs, TBNKs, Gene Module Scores, Protein Module Scores, & Grey Module Proteins'
)

# Load the data
gviss = lapply(GVIS.IN.PATHS, readRDS)
pvalss = lapply(PVALS.IN.PATHS, readRDS)
eset = readRDS(SOMALOGIC.IN.PATH)

# Get the feature meta data from the eset and remove the Units column (it's somewhat confusing in the context of the other data types)
somamer.meta.data = fData(eset) %>% select(-Units)

# For each classification task (e.g. healthy versus all)
dfs = mapply(function(gvis, pvals, classifier.group) {
  # For each classifier (e.g. tbnks, protein modules, etc.)
  dfs = mapply(function(gvi, pval, classifier.name, classifier.number) {

    # Make sure the gvis and their corresponding pvalues are in the same order
    stopifnot(names(gvi) == names(pval))

    # Get the feature names associated with the gvis
    feature.names = names(gvi)

    # Initialize the data frame
    df = data.frame(`Feature Name` = feature.names, 
                    GVI = gvi, 
                    Pval = pval,
                    `Classifier Number` = classifier.number,
                    `Classifier Objective` = classifier.group,
                    `Data Types in Classifier` = classifier.name,
                    stringsAsFactors = F,
                    check.names = F)

    # Get the data type of the feature based on the header in the feature name
    df$`Feature Data Type` = df$`Feature Name` %>%
      replace(., grepl('^somalogic\\.grey\\.',.), 'Grey Module Proteins') %>%
      replace(., grepl('^tbnks\\.',.), 'CBCs/TBNKs') %>%
      replace(., grepl('^somalogic\\.modules\\.',.), 'Protein Module') %>%
      replace(., grepl('^microarray\\.modules\\.',.), 'Gene Module')

    # Manaully remove the feature name from the header
    df$`Feature Name` = df$`Feature Name` %>%
      gsub('^somalogic\\.grey\\.','',.) %>%
      gsub('^tbnks\\.','',.) 


    # Rearrang the data frame column orders
    df = df[, c('Classifier Number',
                'Classifier Objective',
                'Data Types in Classifier',
                'Feature Name',
                'Feature Data Type',
                'GVI',
                'Pval')]

    # Add the somamer metadata to help identify somamer
    df = cbind(df, somamer.meta.data[df$`Feature Name`,])

    return(df)

  }, gvis, pvals, classifier.names, 1:6, SIMPLIFY = F)
  df = Reduce(rbind, dfs)
}, gviss, pvalss, classifier.groups, SIMPLIFY = F)


dat <- dfs[[1]]
#Want to keep only classifier number 6 because that is the one that includes everything it seems
#table(dat$`Classifier Number`, dat$`Feature Data Type`)
#dup_feat <- dat[["Feature Name"]] %>% .[duplicated(.)] %>% unique()
#dat %>% filter(`Feature Name` %in% dup_feat)
#table(dat[["Data Types in Classifier"]])

dat <- dat %>% filter(`Classifier Number` == 6)

dat <- dat %>% 
        mutate(AdjP = p.adjust(Pval, method = "fdr"))
ix <- which(colnames(dat) == "Pval")
dat <- bind_cols(dat[, 1:ix], data.frame(AdjP = dat[["AdjP"]]), dat[, (ix + 1):(ncol(dat) -1)])

#lapply(dat, function(x){
#  if(length(unique(x)) > 5){
#          table(table(x))
#  }else{
#    table(x)
#  }
#})
#table(dat[["Feature Name"]])


dat <- dat %>% select(-1)

dat <- dat %>% mutate(`Feature Name` = replace_tbnk_names(`Feature Name`))

dat <- dat %>%
        mutate(#`Feature Name` = 
               `Feature Name`= 
               replace_mod_names_both(`Feature Name`, 
                                      proteome_prefix = "somalogic.modules.", 
                                      transcriptome_prefix = "microarray.modules.")) 

#dat %>% filter(grepl("odule", `Feature Data Type`, ignore.case = T), 
#  !grepl("grey", `Feature Data Type`, ignore.case = T)
#)

readr::write_csv(dat, TABLE.OUT.PATHS[[1]])

R Snakemake dplyr Biobase From line 2 of Figure_4_Tables/figure_4_tables_version_2.R

library(tidyverse)

if(!exists("snakemake")){
  setwd("../../..")
  source("scripts/util/paper/parse_snakemake.R")
  parse_snakemake(rule = "figure_4_meta_analysis_cgp_n_subj")
}

CGPS.CLEAN.IN.PATH <- snakemake@input[["clean_cgps"]]
CGPS.ORIG.IN.PATH <- snakemake@input[["orig_cgps"]]

TAB.OUT.PATH <- snakemake@output[[1]]

cgps_orig <- readRDS(CGPS.ORIG.IN.PATH)
cgps_clean <- readRDS(CGPS.CLEAN.IN.PATH)


dat <- lapply(cgps_clean, function(disease){
  lapply(disease, function(study){
    data.frame(case = length(study$case), control = length(study$control),
    case_samples = paste(study$case, collapse = " "), control_samples = paste(study$control, collapse = " "))
  }) %>% bind_rows(.id = "study_platform")
}) %>% bind_rows(.id = "disease")

cgp_orig_dat <- lapply(cgps_orig, function(disease){
  lapply(disease, function(cgp){
    as.data.frame(cgp$study.info)
  }) %>% bind_rows()
}) %>% bind_rows(.id = "disease")

cgp_orig_dat <- cgp_orig_dat %>% 
        mutate(study_platform = paste(study, platform, sep = "."))

rm_cgps <- c(
  "GSE9006-Diabetes_Mellitus,_Type_1-PBMC_newly diagnosed_paired with 1 month follow up::GSE9006-Healthy-PBMC_unpaired",
  "Jam_human_RA_GSE26554-JIA-PBMC::Jam_human_RA_GSE26554-Control-PBMC",
  "Jam_human_RA_GSE61281-Psoriatric_arthritis-Whole_blood::Cutaneous psoriasis without arthritis_GSE61281-Cutaneous_psoriasis_without_arthritis-Whole_blood",
  "Jam_Human_RA_JIA-PBMC::Jam_Human_RA_Controls-PBMC",
  "Jam_human_RA_GSE26554-Oligoarticular JIA-PBMC::Jam_human_RA_GSE26554-Control-PBMC",
  "Jam_Human_RA_JIA-PBMC::Jam_Human_RA_Controls-PBMC"
)

cgp_orig_dat <- cgp_orig_dat %>% filter(!name %in% rm_cgps)

cgp_orig_summ <- cgp_orig_dat %>%
        group_by(disease, study, platform, study_platform) %>%
        summarise(cgps = paste(name, collapse = "\t"))

dat <- dat %>%
        right_join(cgp_orig_summ)

note_dat <- tribble(
  ~study, ~note,
  "GSE21942", "GSM545843, GSM545845 were removed as these were technical replicates of other samples in the study",
  "GSE30210", "Removed additional replicates such that each individual only had one sample. Selected last sample chronologically",
  "GSE8650", "Removed additional replicates such that each individual only had one sample. Selected last sample chronologically. GSM214490 and GSM214492 were removed as they were believed to have unreliable diagnoses according to the original publication",
  "GSE15645", "Removed patients who were experiencing clinical remission of symptoms",
  "GSE42834", "Removed patients with non-active sarcoid"
)

dat <- dat %>% left_join(note_dat)

dat <- dat %>% filter(disease != "SLE")

write_csv(dat, TAB.OUT.PATH)

R Snakemake tidyverse From line 1 of Figure_4_Tables/meta_analysis_n_subj_per_study.R

library(tidyverse)

#setwd("/hpcdata/sg/sg_data/PROJECTS/Monogenic_Project")

HI.IN.PATH = snakemake@input[["healthy_index"]]#'Classification/results/healthy_rf_results_all.RDS'
META.IN.PATH = snakemake@input[["meta"]]#'Classification/healthy_random_forest_sample_meta_data_all.RDS'

TABLE.OUT.PATH <- snakemake@output[[1]]#"Paper_1_Figures/Figure_4_Tables/hi_results_full_mod.csv"

results = readRDS(HI.IN.PATH)
meta = readRDS(META.IN.PATH)


## Create a data frame with the HI and condition for each subject
df = data.frame(
                patient_id = rownames(results),
                healthy.index = results$all.modules.plus.grey.with.tbnks, 
                condition = meta$condition)


write_csv(df, TABLE.OUT.PATH)

R tidyverse From line 1 of Figure_4_Tables/save_hi.R

library(tidyverse)

# Set paths
#setwd("/hpcdata/sg/sg_data/PROJECTS/Monogenic_Project")
VALIDATION.RESULTS.IN.PATH = snakemake@input[["overall_res"]]#"Reference/jamboree/analysis_output/results/jamboree_z_score_results.RDS"
POOLED.TABLE.OUT.PATH = snakemake@output[["pooled_tab"]]#'Paper_1_Figures/Figure_4_Tables/figure_4_meta_analysis_table.txt'
EFFSIZE.TABLE.OUT.PATH = snakemake@output[["effect_sizes"]]#'Paper_1_Figures/Figure_4_Tables/figure_4_meta_analysis_effsize_table.txt'

source("scripts/util/Plotting/tbnk_featurename_replace.R")
# Load data
scores = readRDS(VALIDATION.RESULTS.IN.PATH)

rownames(scores$metaAnalysis$pooledResults)

pooled <- scores$pooledResults %>%
        rownames_to_column(var = "feature")

pooled <- pooled %>% filter(feature != "microarray.classifier") %>%
        mutate(feature = gsub("PC1", "jPC1", feature)) %>%
        mutate(feature = gsub("somalogic\\.grey", "serum", feature)) %>%
        mutate(feature = gsub("somalogic\\.modules\\.purple", "PM2", feature)) %>%
        mutate(feature = gsub("tbnks\\.", "", feature)) %>%
        mutate(feature = gsub("healthy\\.index", "Immune Health Metric", feature)) %>%
        mutate(feature = replace_tbnk_names(feature))

to_dat <- function(mat, value_col_name){
  dat <- as.data.frame(mat) %>% rownames_to_column(var = "feature") %>%
          gather(key = "study", value = "value", -feature)

  colnames(dat)[colnames(dat)== "value"] <- value_col_name
  dat
}

effect_sizes <- left_join(
  to_dat(scores$datasetEffectSizes, "effectSize"),
  to_dat(scores$datasetEffectSizes, "effectSizeStandardError")
)

effect_sizes <- effect_sizes %>%
        mutate(feature = gsub("PC1", "jPC1", feature)) %>%
        mutate(feature = gsub("somalogic\\.grey", "serum", feature)) %>%
        mutate(feature = gsub("somalogic\\.modules\\.purple", "PM2", feature)) %>%
        mutate(feature = gsub("tbnks\\.", "", feature)) %>%
        mutate(feature = gsub("healthy\\.index", "Immune Health Metric", feature)) %>%
        mutate(feature = replace_tbnk_names(feature))



write_csv(pooled, POOLED.TABLE.OUT.PATH)
write_csv(effect_sizes, EFFSIZE.TABLE.OUT.PATH)

R tidyverse From line 2 of Figure_4_Tables/save_meta_analysis_results.R

library(tidyverse)

#setwd("/hpcdata/sg/sg_data/PROJECTS/Monogenic_Project")

SIGS.IN.PATH <- snakemake@input[[1]]#"Classification/transcriptional_surrogates/surrogate_signatures.RDS"

TABLE.OUT.PATH <- snakemake@output[[1]]#"Paper_1_Figures/Figure_4_Tables/surrogate_sig_genes.csv"

source("scripts/util/Plotting/tbnk_featurename_replace.R")

sigs <- readRDS(SIGS.IN.PATH)

dat_list <- lapply(sigs, function(x){
  sig_members <- sapply(x, paste, collapse = " ")
  data.frame(direction = names(x), gene_symbols = sig_members)
})

dat <- bind_rows(dat_list, .id = "signature")


dat <- dat %>% filter(signature != "microarray.classifier") %>%
        mutate(signature = gsub("PC1", "jPC1", signature)) %>%
        mutate(signature = gsub("somalogic\\.grey", "serum", signature)) %>%
        mutate(signature = gsub("somalogic\\.modules\\.purple", "PM2", signature)) %>%
        mutate(signature = gsub("tbnks\\.", "", signature)) %>%
        mutate(signature = gsub("healthy\\.index", "Immune Health Metric", signature))

dat$signature <- replace_tbnk_names(dat$signature)


write_csv(dat, TABLE.OUT.PATH)

R tidyverse From line 1 of Figure_4_Tables/save_surrogate_signature_genes.R

library(tidyverse)
library(Biobase)
library(ggpubr)

HI.IN.PATH = snakemake@input[["rf_results"]]#'Classification/results/healthy_rf_results_all.RDS'
META.IN.PATH = snakemake@input[["rf_meta"]]#'Classification/random_forest_sample_meta_data.RDS'
SOMA.IN.PATH = snakemake@input[["soma_data"]]#'Data/Somalogic/data_analysis_ready/analysis_ready_subject_level_training_somalogic.rds'

FIG.OUT.PATH = snakemake@output[[1]]#"Paper_1_Figures/Figure_5/cxcl9_cor_immune_health_metric.pdf"

hi_dat <- readRDS(HI.IN.PATH)
meta <- readRDS(META.IN.PATH)
soma <- readRDS(SOMA.IN.PATH)


hi_dat <- hi_dat %>%
        rownames_to_column(var = "patient_id") %>%
        mutate(`Immune Health Metric` = all.modules.plus.grey.with.tbnks) %>%
        select(patient_id, `Immune Health Metric`)

soma_mat <- exprs(soma)
grep("9", rownames(soma_mat), ignore.case = T, value = T)
grep("mig", rownames(soma_mat), ignore.case = T, value = T)

feat_dat <- featureData(soma)@data

feat_dat %>% filter(Target == "MIG")

cxcl9_dat <- pData(soma) %>%
        select(patient_id, condition) %>%
        mutate(`CXCL9` = soma_mat["MIG", ])

plot_dat <- left_join(hi_dat, cxcl9_dat)

plot_dat <- plot_dat %>%
        mutate(healthy = ifelse(condition == "Healthy", "Healthy", "Disease")) %>%
        mutate(healthy = factor(healthy, levels = c("Healthy", "Disease")))

p <- ggplot(plot_dat, aes(x = `CXCL9`, y = `Immune Health Metric`)) +
        geom_point() +
        stat_cor(method = "spearman") +
        theme_bw() +
        facet_wrap(~healthy) +
        ggtitle("Monogenic data using Immune Health Metric scores directly")

ggsave(plot = p, filename = FIG.OUT.PATH, height = 3, width = 6)

R ggplot2 tidyverse ggpubr Biobase From line 1 of Figure_5/cxcl9_ihm_cor.R

library(ggplot2)
library(Biobase)
library(ggpubr)
library(dplyr)
library(tidyr)


# Load paths
if(exists("snakemake")){
  AGING.ESET.IN.PATH = snakemake@input[[1]]#'Reference/ferrucci/processed/aging_eset.RDS'
  PROTEOMIC.SIGNATURE.IN.PATH = snakemake@input[[2]]#'Classification/proteomic_surrogates/healthy.index.plasma.surrogate.somaId.RDS'
  META.ANALYSIS.Z.SCORE.IN.PATH = snakemake@input[[3]]#'Reference/jamboree/analysis_output/results/jamboree_z_score_results.RDS'
  CGPS.IN.PATH = snakemake@input[[4]]#'Reference/jamboree/data_analysis_ready/cgps_clean.RDS'

  FIGURE.5a.OUT.PATH = snakemake@output[[1]]#'Paper_1_Figures/Figure_5/5.a.pdf'
  FIGURE.5c.OUT.PATH = snakemake@output[[2]]#'Paper_1_Figures/Figure_5/5.c.pdf'
  FIGURE.5d.OUT.PATH = snakemake@output[[3]]#'Paper_1_Figures/Figure_5/5.d.pdf'
}else{
  AGING.ESET.IN.PATH = 'Reference/ferrucci/processed/aging_eset.RDS'
  PROTEOMIC.SIGNATURE.IN.PATH = 'Classification/proteomic_surrogates/healthy.index.plasma.surrogate.somaId.RDS'
  META.ANALYSIS.Z.SCORE.IN.PATH = 'Reference/jamboree/analysis_output/results/jamboree_z_score_results.RDS'
  CGPS.IN.PATH = 'Reference/jamboree/data_analysis_ready/cgps_clean.RDS'

  FIGURE.5a.OUT.PATH = 'Paper_1_Figures/Figure_5/5.a.pdf'
  FIGURE.5c.OUT.PATH = 'Paper_1_Figures/Figure_5/5.c.pdf'
  FIGURE.5d.OUT.PATH = 'Paper_1_Figures/Figure_5/5.d.pdf'
  # Source utilities
  setwd("/hpcdata/sg/sg_data/PROJECTS/Monogenic_Project")
}
source('scripts/util/Signatures/get_signature_scores.R')
# Figure 5a -- Baltimore Healthy Aging Study, age versus HI surrogate signature scatterplot with regression line

## Load the Baltimore Healthy Aging Study eset and the somalogic Healthy Index plasma surrogate signature
eset = readRDS(AGING.ESET.IN.PATH)
healthy.index.surrogate.signature = readRDS(PROTEOMIC.SIGNATURE.IN.PATH)

## Subset the eset to just the samples (not QC, Calibrators, or buffers)
eset = eset[,eset$SampleType == 'Sample']

## Extract the scores from the signature
X = t(exprs(eset))
scores = util.get_signature_score(X, healthy.index.surrogate.signature)
ages = eset$Age

## Put the healthy index surrogate scores into a data frame
df = data.frame(Age = ages, Healthy.Index = scores)

## We create the plot
p = ggplot(df, aes(x = Age, y = Healthy.Index)) + geom_point() + 
  geom_smooth(method = 'lm', formula = y ~ x) + theme_bw() +
  stat_cor(label.x = 70, label.y = .55, label.sep = '\n', output.type = 'text') + 
  ylab('Proteomic Healthy Index Surrgoate') + xlab('Age')

## And save it
ggsave(FIGURE.5a.OUT.PATH, p, height = 4, width = 4)

# Figure 5b -- cartoon description of the meta-analysis
## Please see box: users > Dylan Hirsch > Monogenic Project - Paper 1 > fig5.b.pptx

# Figure 5c -- Predictive ability on external data sets of transcriptional surrogate signatures for top predictors in healthy index
# (Note that the text box saying * p < .10, ** p < .05, *** p < .01 was made manually as it looks nicer with a proper text editor
# than via using ggplot)

## Load metaintegrator meta-analysis scores
results = readRDS(META.ANALYSIS.Z.SCORE.IN.PATH)
results = results$pooledResults # Get the list element holding the meta-analysis results

## Create a data frame with the feature names, effect sizes, a 95% confidence interval, and a BH-adjusted pvalue
df = data.frame(feature = rownames(results), 
                effect.size = results$effectSize, 
                se = 1.96 * results$effectSizeStandardError, 
                pval = results$effectSizeFDR)

## Remove the microarray classifier (which served as our "positive control" classifier to make sure that we weren't losing
## signal by creating surrogate signatures of other data type)
df = df %>% filter(feature != 'microarray.classifier')

## Manually clean up the feature names to be more clear and presentable
df = df %>% 
  mutate(feature = feature %>%
           gsub('somalogic\\.grey\\.','', .) %>%
           gsub('tbnks\\.','', .) %>%
           gsub('\\.',' ', .) %>%
           gsub('nk_cells_percent','NK Cells (%)', .) %>%
           gsub('nk_cells_abs','NK Cells (#)', .) %>%
           gsub('healthy index','Immune Health Metric', .) %>%
           gsub('somalogic modules purple','PM2', .) %>%
           gsub('beta','b', .) %>%
           gsub('PC1','jPC1', .) %>%
           gsub('rdw','RDW', .))

## Order the features by effect size
df = df %>% 
  arrange(desc(effect.size)) %>%
  mutate(feature = factor(feature, levels = feature))

## Create columns for the number of stars to put next to each pvalue
df = df %>%
  mutate(p.value.stars = '') %>%
  mutate(p.value.stars = p.value.stars %>%
           replace(pval < .10,'*') %>%
           replace(pval < .05,'**') %>%
           replace(pval < .01,'***'))

levels(df$feature) <- levels(df$feature) %>%
        gsub(pattern = "", replacement = "")

## Create the plot
p = ggplot(df, aes(x = effect.size, y = feature, text = p.value.stars)) +
  geom_point() +
  geom_errorbarh(aes(xmin=effect.size-se, xmax=effect.size+se), height=0) +
  geom_text(aes(label = p.value.stars), nudge_y = .2, size = 5, show.legend = TRUE) + 
  ylab('Parameter') + xlab('Effect Size') +
  theme_bw() + scale_shape_manual(values = c(15,16,17,18)) +
  theme(axis.title.x = element_text(size = 12),
        axis.text.x = element_text(size = 12),
        axis.title.y = element_text(size = 12),
        axis.text.y = element_text(size = 12))

## Save the  plot
ggsave(FIGURE.5c.OUT.PATH, p, height = 4, width = 6)

# Figure 5d -- Forest plots of signature scores for each study in the meta-analysis

## Load meta-analysis result and comparison group pairs
results = readRDS(META.ANALYSIS.Z.SCORE.IN.PATH)
cgps = readRDS(CGPS.IN.PATH)

## Get a map between the study name and its corresponding disease
### Create an empty vector to hold these names
studiess = c()
### For each disease
for(disease in names(cgps)) {
  ### Get the name of the studies for that disease
  studies = names(cgps[[disease]])
  ### Create a map from the study name to its corresponding disease
  new_studies = rep(disease, length(studies))
  names(new_studies) = studies
  ### Add the map to the empty vector
  studiess = c(studiess, new_studies)
}

## Get the sample size for each study
### For each disease
sizess = lapply(names(cgps), function(disease) {
  ### For each study of that disease
  studies = names(cgps[[disease]])
  ### Get the number of samples in that study
  sizes = sapply(cgps[[disease]], function(study) {
    length(unlist(study))
  })
  ### Name the sizes vector
  names(sizes) = names(cgps[[disease]])
  return(sizes)
})
sizess = unlist(sizess)

## Get the effect sizes and standard errors associated with each study
effects = results$datasetEffectSizes
ses = results$datasetEffectSizeStandardErrors

## Get the overall effect size and standard error
meta_effects = results$pooledResults[, 'effectSize', drop = FALSE]
meta_ses = results$pooledResults[, 'effectSizeStandardError', drop = FALSE]

## Instantiate a function to create the data frame used for the forest plot of a single feature's signature
get_df = function(feature) {
  ## Create an initial data frame with feature names, effect sizes, and standard error
  df1 = data.frame(study = colnames(effects), effect = effects[feature,], se = 1.96 * ses[feature,])

  ## Add the diseases associated with each study, the feature name, and the study size associated with the study
  df1$disease = factor(studiess[df1$study], c('DM1', 'MS', 'RA', 'sarcoid','summary',''))
  df1$feature = feature
  df1$study.size = sizess[df1$study]

  ## We also create a second data frame that is essentially a blank row to separate the diamond from the dots
  df2 = data.frame(study = '', effect = 0, 
                   se = 0, feature = feature,
                   disease = '', study.size = 0)


  ## We create a third data frame that just contains the meta_analysis effect size for display via a triange
  df3 = data.frame(study = 'Summary', effect = meta_effects[feature, ], 
                   se = 1.96 * meta_ses[feature, ], feature = feature,
                   disease = 'summary', study.size = 50)


  ## We put the data frames together
  df = rbind(df1, df2)
  df = rbind(df, df3)

  ## We put all the studies together into a factor
  df$study = factor(df$study, levels = rev(levels(df$study)))
  return(df)
}

## We choose the features we want to show in the plot, and get their corresponding data frames for plotting
features = c("tbnks.nk_cells_abs","tbnks.rdw","somalogic.modules.purple","healthy.index")
dfs = lapply(features, get_df)

## We combine these dataframes
df = Reduce(rbind,dfs)
df$feature = factor(df$feature, features)

## We rename the features for easier viewing
levels(df$feature) = c('NK Cells (#)','RDW','PM2','Immune Health\nMetric')

## We manually create the standard ggplot colors
hues = seq(15, 375, length = 6)
colors = hcl(h = hues, l = 65, c = 100)[1:5]

## And create the forest plot
p = ggplot(df, aes(x = effect, y = study, color = disease)) +
  geom_point(aes(size = study.size, shape = disease), show.legend = T) +
  scale_shape_manual(values = c(16, 16, 16, 16, 18, 16)) + # 16 is for a circle and 18 a triangle
  geom_errorbarh(aes(xmin=effect-se, xmax=effect+se), height=0, show.legend = F, size = 1) +
  scale_color_manual(values = c(colors,'transparent')) + # We want the dot at 0 in the empty row to be transparent (we make it 0 to avoid the warnings from using an NA)
  xlab('Effect Size') + ylab('Study') + 
  theme_bw() + geom_vline(xintercept = 0, linetype = 'dashed') + facet_wrap(~feature, nrow = 1) + # We have a dashed line at 0 to represent no effect
  theme(axis.ticks.y = element_blank()) +
  guides(colour = guide_legend(override.aes = list(size=7)))

## Save the  plot
ggsave(FIGURE.5d.OUT.PATH, p, height = 4, width = 9)

R Snakemake ggplot2 dplyr tidyr Diamond ggpubr Biobase From line 2 of Figure_5/figure_5_version_3.R

library(tidyverse)
library(Biobase)
library(ggpubr)

#if(exists("snakemake")){
AGING.ESET.IN.PATH = snakemake@input[["aging_eset"]]#"Reference/ferrucci/processed/aging_eset.RDS"
PROTEOMIC.SIGNATURE.IN.PATH = snakemake@input[["proteomic_surrogate"]]#'Classification/proteomic_surrogates/healthy.index.plasma.surrogate.somaId.RDS'

PLOT.OUT.PATH = snakemake@output[[1]]#"Paper_1_Figures/Figure_5/il6_cor_immune_health_metric_in_ferrucci_using_surrogate.pdf"


#} else {
#  AGING.ESET.IN.PATH = "Reference/ferrucci/processed/aging_eset.RDS"
#  PROTEOMIC.SIGNATURE.IN.PATH = 'Classification/proteomic_surrogates/healthy.index.plasma.surrogate.somaId.RDS'
#
#
#  PLOT.OUT.PATH = "Paper_1_Figures/Figure_5/il6_cor_immune_health_metric_in_ferrucci_using_surrogate.pdf"
#
#  setwd("/hpcdata/sg/sg_data/PROJECTS/Monogenic_Project")
#}

source('scripts/util/Signatures/get_signature_scores.R')
# Figure 5a -- Baltimore Healthy Aging Study, age versus HI surrogate signature scatterplot with regression line

## Load the Baltimore Healthy Aging Study eset and the somalogic Healthy Index plasma surrogate signature
eset = readRDS(AGING.ESET.IN.PATH)
healthy.index.surrogate.signature = readRDS(PROTEOMIC.SIGNATURE.IN.PATH)

## Subset the eset to just the samples (not QC, Calibrators, or buffers)
eset = eset[,eset$SampleType == 'Sample']

## Extract the scores from the signature
X = t(exprs(eset))
scores = util.get_signature_score(X, healthy.index.surrogate.signature)

soma_mat <- exprs(eset)
feat_names <- featureData(eset)@data
grep("il-6", feat_names$Target, ignore.case = T, value = T)

il6_id <- feat_names %>%
        filter(Target == "IL-6") %>% 
        pull(SomaId)

il6_scores <- soma_mat[il6_id, ]

## Put the healthy index surrogate scores into a data frame
df = data.frame(`IL-6`= il6_scores, `Immune Health Metric`= scores, check.names = FALSE)

p <- ggplot(df, aes(x = `IL-6`, y = `Immune Health Metric`)) +
        geom_point() +
        stat_cor(method = "spearman") +
        theme_bw() +
        ggtitle("Ferrucci data using IHM surrogate")

ggsave(plot = p, filename = PLOT.OUT.PATH, height = 3, width = 3)

R Snakemake ggplot2 tidyverse ggpubr Biobase From line 1 of Figure_5/il6_ihm_cor_ferrucci_data.R

library(tidyverse)
library(Biobase)
library(ggpubr)

#if(exists("snakemake")){
HI.IN.PATH = snakemake@input[["rf_results"]]#'Classification/results/healthy_rf_results_all.RDS'
META.IN.PATH = snakemake@input[["rf_meta"]]#'Classification/random_forest_sample_meta_data.RDS'
SOMA.IN.PATH = snakemake@input[["soma_data"]]#'Data/Somalogic/data_analysis_ready/analysis_ready_subject_level_training_somalogic.rds'

PLOT1.OUT.PATH = snakemake@output[[1]]#"Paper_1_Figures/Figure_5/il6_cor_immune_health_metric.pdf"
PLOT2.OUT.PATH = snakemake@output[[2]]#"Paper_1_Figures/Figure_5/il6_cor_immune_health_metric.pdf"
#} else {
#  HI.IN.PATH = 'Classification/results/healthy_rf_results_all.RDS'
#  META.IN.PATH = 'Classification/random_forest_sample_meta_data.RDS'
#  SOMA.IN.PATH = 'Data/Somalogic/data_analysis_ready/analysis_ready_subject_level_training_somalogic.rds'
#
#  PLOT.OUT.PATH = "Paper_1_Figures/Figure_5/il6_cor_immune_health_metric.pdf"
#
#  setwd("/hpcdata/sg/sg_data/PROJECTS/Monogenic_Project")
#}

source("scripts/util/paper/abbrev_cond.R")

hi_dat <- readRDS(HI.IN.PATH)
meta <- readRDS(META.IN.PATH)
soma <- readRDS(SOMA.IN.PATH)


hi_dat <- hi_dat %>%
        rownames_to_column(var = "patient_id") %>%
        mutate(`Immune Health Metric` = all.modules.plus.grey.with.tbnks) %>%
        select(patient_id, `Immune Health Metric`)

soma_mat <- exprs(soma)
#grep("il.6", rownames(soma_mat), ignore.case = T, value = T)

il6_dat <- pData(soma) %>%
        select(patient_id, condition) %>%
        mutate(`IL-6` = soma_mat["IL.6", ])

plot_dat <- left_join(hi_dat, il6_dat)

plot_dat <- plot_dat %>% 
        mutate(condition = abbrev_cond(condition))

plot_dat <- plot_dat %>%
        mutate(healthy = ifelse(condition == "Healthy", "Healthy", "Disease")) %>%
        mutate(healthy = factor(healthy, levels = c("Healthy", "Disease")))


p <- ggplot(plot_dat, aes(x = `IL-6`, y = `Immune Health Metric`)) +
        geom_point() +
        stat_cor(method = "spearman") +
        theme_bw() +
        facet_wrap(~healthy) +
        ggtitle("Monogenic data using Immune Health Metric scores directly")

ggsave(plot = p, filename = PLOT1.OUT.PATH, height = 3, width = 6)


p <- ggplot(plot_dat %>% filter(condition != "Healthy"), aes(x = `IL-6`, y = `Immune Health Metric`)) +
        geom_text(aes(label = condition, color= condition)) +
        stat_cor(method = "spearman") +
        theme_bw() +
        #facet_wrap(~healthy) +
        ggtitle("Monogenic data using Immune Health Metric scores directly")

ggsave(plot = p, filename = PLOT2.OUT.PATH, height = 8, width = 15)

R Snakemake ggplot2 tidyverse ggpubr Biobase From line 1 of Figure_5/il6_ihm_cor.R

knitr::opts_chunk$set(echo = TRUE)
#knitr::opts_knit$set(root.dir = normalizePath("../../../"))

R Markdown From line 9 of Figure_5_Statistics/figure_5_stats_version_1.Rmd

library(ggplot2)
library(Biobase)
library(ggpubr)
library(MetaIntegrator)
library(limma)

R Markdown ggplot2 ggpubr Biobase MetaIntegrator From line 14 of Figure_5_Statistics/figure_5_stats_version_1.Rmd

source('scripts/util/Enrichment/hyperGeo.R')

R Markdown From line 23 of Figure_5_Statistics/figure_5_stats_version_1.Rmd

# Baltimore Aging Study eset
ESET.IN.PATH = snakemake@input[[1]] #'Reference/ferrucci/processed/aging_eset.RDS'
# HI proteomic surrogate signature
SIGNATURE.IN.PATH = snakemake@input[[2]] #'Classification/proteomic_surrogates/healthy.index.plasma.surrogate.somaId.RDS'
# Baltimore Aging Study somamer associations from their paper
TABLE.IN.PATH = snakemake@input[[3]] #'Reference/ferrucci/raw/acel12799-sup-0004-TableS3.txt'

R Markdown From line 28 of Figure_5_Statistics/figure_5_stats_version_1.Rmd

eset = readRDS(ESET.IN.PATH) ## The baltimore aging cohort eset
feature.data = fData(eset) ## The baltimore aging cohort eset
signature = readRDS(SIGNATURE.IN.PATH) ## The plasma somalogic surrogate signature for the HI
table = read.table(TABLE.IN.PATH, header = TRUE, sep = '\t') ## The Ferrucci data significance table

R Markdown From line 40 of Figure_5_Statistics/figure_5_stats_version_1.Rmd

signature = unname(unlist(signature)) ## Get the somalogic proteins used in the proteomic surrorgate signature of the HI
signature = signature[signature %in% feature.data$SomaId] ## subset to only the proteins measured by Ferrucci
pvals = table$p ## Get the p values from the Ferrucci data set
names(pvals) = table$SomaId
## Test whether the pvalues of proteins in the HI surrogate signature are more significant than those outside the
## HI surrogate signature
wilcox.test(pvals[names(pvals) %in% signature], pvals[! names(pvals) %in% signature], alternative = 'less')

R Markdown From line 48 of Figure_5_Statistics/figure_5_stats_version_1.Rmd

pvals = pvals[p.adjust(pvals, 'fdr') < .05]

R Markdown From line 61 of Figure_5_Statistics/figure_5_stats_version_1.Rmd

hyperGeoTest(names(pvals), feature.data$SomaId, signature)

R Markdown From line 65 of Figure_5_Statistics/figure_5_stats_version_1.Rmd

library(tidyverse)
library(Biobase)
library(BiocGenerics)

if(!exists("snakemake")){
  setwd("../../..")
  source("scripts/util/paper/parse_snakemake.R")
  parse_snakemake(rule = "figure_5_tables_cxcl9_ihm_age_regression")
}


AGING.ESET.IN.PATH = snakemake@input[[1]]
PROTEOMIC.SIGNATURE.IN.PATH = snakemake@input[[2]]

PLOT.OUT.PATH = snakemake@output[[1]]#"Paper_1_Figures/Figure_5/cxcl9_cor_immune_health_metric_in_ferrucci_using_surrogate.pdf"


source('scripts/util/Signatures/get_signature_scores.R')
## Load the Baltimore Healthy Aging Study eset and the somalogic Healthy Index plasma surrogate signature
eset = readRDS(AGING.ESET.IN.PATH)
healthy.index.surrogate.signature = readRDS(PROTEOMIC.SIGNATURE.IN.PATH)

## Subset the eset to just the samples (not QC, Calibrators, or buffers)
eset = eset[,eset$SampleType == 'Sample']

## Extract the scores from the signature
X = t(exprs(eset))
scores = util.get_signature_score(X, healthy.index.surrogate.signature)

soma_mat <- exprs(eset)
feat_names <- featureData(eset)@data
#grep("il-6", feat_names$Target, ignore.case = T, value = T)
grep("MIG", feat_names$Target, ignore.case = T, value = T)

cxcl9_id <- feat_names %>%
        filter(Target == "MIG") %>% 
        pull(SomaId)

cxcl9_scores <- soma_mat[cxcl9_id, ]

## Put the healthy index surrogate scores into a data frame
df = data.frame(`cxcl9`= cxcl9_scores, `ihm`= scores,
                age = eset$Age,
                check.names = FALSE)

bmore_mod <- lm(ihm ~ age + cxcl9, data = df)
bmore_summ <- summary(bmore_mod)
bmore_dat <- as.data.frame(bmore_summ$coefficients) %>%
        rownames_to_column(var = "term")



#---- monogenic
hi.in.path = snakemake@input[[3]]
meta.in.path = snakemake@input[[4]]
soma.in.path = snakemake@input[[5]]

#fig.out.path = snakemake@output[[1]]#"paper_1_figures/figure_5/cxcl9_cor_immune_health_metric.pdf"

hi_dat <- readRDS(hi.in.path)
meta <- readRDS(meta.in.path)
soma <- readRDS(soma.in.path)


hi_dat <- hi_dat %>%
        rownames_to_column(var = "patient_id") %>%
        mutate(`ihm` = all.modules.plus.grey.with.tbnks) %>%
        select(patient_id, `ihm`)

soma_mat <- exprs(soma)
grep("9", rownames(soma_mat), ignore.case = t, value = t)
grep("mig", rownames(soma_mat), ignore.case = t, value = t)

feat_dat <- featureData(soma)@data

feat_dat %>% filter(Target == "MIG")

cxcl9_dat <- pData(soma) %>%
        select(patient_id, condition) %>%
        mutate(`cxcl9` = soma_mat["MIG", ]) %>%
        mutate(age = soma$Age)

df <- left_join(hi_dat, cxcl9_dat)

## put the healthy index surrogate scores into a data frame
#df = data.frame(`cxcl9`= , `ihm`= scores,
#                age = eset$age,
#                check.names = false)

mono_mod <- lm(ihm ~ age + cxcl9, data = df)
mono_summ <- summary(mono_mod)
mono_dat <- as.data.frame(mono_summ$coefficients) %>%
        rownames_to_column(var = "term")


combined_dat <- bind_rows(list(Monogenic = mono_dat, Baltimore = bmore_dat), .id = "Study")

TAB.OUT.PATH <- snakemake@output[[1]]

write_csv(combined_dat, TAB.OUT.PATH)

R Snakemake tidyverse Biobase BiocGenerics From line 1 of Figure_5_Tables/cxcl9_regress_age_ihm.R

library(dplyr)

# Set paths
VALIDATION.RESULTS.IN.PATH = snakemake@input[[1]]#'Reference/jamboree/analysis_output/results/jamboree_gene_level_results.RDS'
TABLE.OUT.PATH = snakemake@output[[1]]#'Paper_1_Figures/Figure_5_Tables/figure_5_meta_analysis_table.txt'

# Load data
scores = readRDS(VALIDATION.RESULTS.IN.PATH)

# Save to table
write.table(scores, file = TABLE.OUT.PATH, sep = "\t", row.names = F, col.names = T)

R dplyr From line 2 of Figure_5_Tables/figure_5_tables_version_1.R

library(tidyverse)
library(Biobase)
library(BiocGenerics)

#setwd("/hpcdata/sg/sg_data/PROJECTS/Monogenic_Project")
SIG.IN.PATH <- snakemake@input[["sig"]]#"Classification/proteomic_surrogates/healthy.index.plasma.surrogate.somaId.RDS"
ESET.IN.PATH <- snakemake@input[["eset"]]#"Data/Somalogic/data_analysis_ready/analysis_ready_sample_level_training_somalogic.rds"

TABLE.OUT.PATH <- snakemake@output[[1]]#"Paper_1_Figures/Figure_5_Tables/proteomic_surrogate_ihm.csv"

sig <- readRDS(SIG.IN.PATH)
eset <- readRDS(ESET.IN.PATH)


featdata <- featureData(eset)@data %>%
        rename(feature = Target) 


sig_dat <- lapply(sig, function(x){
  data.frame(SomaId= x, stringsAsFactors = FALSE)
}) %>% bind_rows(.id = "direction")

sig_dat <- sig_dat %>% left_join(featdata)

sig_dat <- sig_dat %>% 
        select(-Dilution) %>%
        select(feature, everything())

write_csv(sig_dat, TABLE.OUT.PATH)

R tidyverse Biobase BiocGenerics From line 1 of Figure_5_Tables/proteomic_hi_surrogate.R

library(Biobase)
library(pheatmap)
library(ggplot2)
library(dplyr)
library(tidyr)
library(reshape2)
library(cowplot)

# Set paths
## Monogenic metadata
#if(exists("snakemake")){
METADATA.IN.PATH = snakemake@input[[1]]#'Metadata/monogenic.de-identified.metadata.RData'
## Microarray variance partition results with condition and medication covariates
MICROARRAY.FEATURE.VP.IN.PATH = snakemake@input[[2]]#'Data/Microarray/analysis_output/variance_decomposition/microarray_features_vp.RDS'
## Somalogic variance partition results with condition and medication covariates
SOMALOGIC.FEATURE.VP.IN.PATH = snakemake@input[[3]]#'Data/Somalogic/analysis_output/variance_decomposition/somalogic_features_vp.RDS'

## Microarray modules variance partition results with just patient as a covariate
MICROARRAY.MODULES.SIMPLE.VP.IN.PATH = snakemake@input[[4]]#'Data/Microarray/analysis_output/stability/microarray_modules_standard_vp.rds'
## Somalogic modules variance partition results with just patient as a covariate
SOMALOGIC.MODULES.SIMPLE.VP.IN.PATH = snakemake@input[[5]]#'Data/Somalogic/analysis_output/stability/somalogic_modules_standard_vp.rds'
## TBNKs modules variance partition results with just patient as a covariate
TBNKS.SIMPLE.VP.IN.PATH = snakemake@input[[6]]#'Data/TBNK/analysis_output/stability/tbnks_features_standard_vp.rds'
## Microarray features variance partition results with just patient as a covariate
MICROARRAY.FEATURES.SIMPLE.VP.IN.PATH = snakemake@input[[7]]#'Data/Microarray/analysis_output/stability/microarray_features_standard_vp.rds'
## Somalogic features variance partition results with just patient as a covariate
SOMALOGIC.FEATURES.SIMPLE.VP.IN.PATH = snakemake@input[[8]]#'Data/Somalogic/analysis_output/stability/somalogic_features_standard_vp.rds'

SUPPLEMENTAL.FIGURE.1a.OUT.PATH = snakemake@output[[1]]#'Paper_1_Figures/Supplemental_Figure_1/S1a.pdf'
SUPPLEMENTAL.FIGURE.1b.OUT.PATH = snakemake@output[[2]]#'Paper_1_Figures/Supplemental_Figure_1/S1b.pdf'
SUPPLEMENTAL.FIGURE.1c.OUT.PATH = snakemake@output[[3]]#'Paper_1_Figures/Supplemental_Figure_1/S1c.pdf'
SUPPLEMENTAL.FIGURE.1d.OUT.PATH = snakemake@output[[4]]#'Paper_1_Figures/Supplemental_Figure_1/S1d.pdf'
SUPPLEMENTAL.FIGURE.1e.OUT.PATH = snakemake@output[[5]]#'Paper_1_Figures/Supplemental_Figure_1/S1e.pdf'
SUPPLEMENTAL.FIGURE.1f.OUT.PATH = snakemake@output[[6]]#'Paper_1_Figures/Supplemental_Figure_1/S1f.pdf'
#}else{
#METADATA.IN.PATH = 'Metadata/monogenic.de-identified.metadata.RData'
### Microarray variance partition results with condition and medication covariates
#MICROARRAY.FEATURE.VP.IN.PATH = 'Data/Microarray/analysis_output/variance_decomposition/microarray_features_vp.RDS'
### Somalogic variance partition results with condition and medication covariates
#SOMALOGIC.FEATURE.VP.IN.PATH = 'Data/Somalogic/analysis_output/variance_decomposition/somalogic_features_vp.RDS'
#
### Microarray modules variance partition results with just patient as a covariate
#MICROARRAY.MODULES.SIMPLE.VP.IN.PATH = 'Data/Microarray/analysis_output/stability/microarray_modules_standard_vp.rds'
### Somalogic modules variance partition results with just patient as a covariate
#SOMALOGIC.MODULES.SIMPLE.VP.IN.PATH = 'Data/Somalogic/analysis_output/stability/somalogic_modules_standard_vp.rds'
### TBNKs modules variance partition results with just patient as a covariate
#TBNKS.SIMPLE.VP.IN.PATH = 'Data/TBNK/analysis_output/stability/tbnks_features_standard_vp.rds'
### Microarray features variance partition results with just patient as a covariate
#MICROARRAY.FEATURES.SIMPLE.VP.IN.PATH = 'Data/Microarray/analysis_output/stability/microarray_features_standard_vp.rds'
### Somalogic features variance partition results with just patient as a covariate
#SOMALOGIC.FEATURES.SIMPLE.VP.IN.PATH = 'Data/Somalogic/analysis_output/stability/somalogic_features_standard_vp.rds'
#
#SUPPLEMENTAL.FIGURE.1a.OUT.PATH = 'Paper_1_Figures/Supplemental_Figure_1/S1a.pdf'
#SUPPLEMENTAL.FIGURE.1b.OUT.PATH = 'Paper_1_Figures/Supplemental_Figure_1/S1b.pdf'
#SUPPLEMENTAL.FIGURE.1c.OUT.PATH = 'Paper_1_Figures/Supplemental_Figure_1/S1c.pdf'
#SUPPLEMENTAL.FIGURE.1d.OUT.PATH = 'Paper_1_Figures/Supplemental_Figure_1/S1d.pdf'
#SUPPLEMENTAL.FIGURE.1e.OUT.PATH = 'Paper_1_Figures/Supplemental_Figure_1/S1e.pdf'
#SUPPLEMENTAL.FIGURE.1f.OUT.PATH = 'Paper_1_Figures/Supplemental_Figure_1/S1f.pdf'
#
#setwd("/hpcdata/sg/sg_data/PROJECTS/Monogenic_Project")
#
#}
# Source utilities
source('scripts/util/Plotting/colors.R')
source('scripts/util/Processing/averageRepeatSamples.R')
source('scripts/util/paper/abbrev_cond.R')
source("scripts/util/Plotting/tbnk_featurename_replace.R")


# Supplemental Figure 1a -- density plot of ages in case and control with lines for medians

## Load metadata
load(METADATA.IN.PATH)

## Get ages for patients in training set
df = monogenic.all.assays %>%
  select(visit_id, patient_id, patient_age_at_time_of_blood_draw, condition, analysis_group) %>%
  mutate(patient_id = paste0('P', patient_id)) %>%
  mutate(age = patient_age_at_time_of_blood_draw) %>%
  mutate(group = ifelse(condition == "Healthy", "Healthy", "Disease")) %>%
  mutate(group = factor(group, levels = c("Healthy", "Disease"))) %>%
  filter(analysis_group == 'Discovery') %>%
  select(-patient_age_at_time_of_blood_draw, -analysis_group)

## Average ages over various visits, first by averaging ages from samples within a visit (should all be almost exactly the same),
## and then averaging ages from samples across visits
df = df %>%
  group_by(visit_id) %>%
  summarise(age = mean(age), patient_id = unique(patient_id), condition = unique(condition), group = unique(group)) %>%
  ungroup() %>%
  group_by(patient_id) %>%
  summarise(age = mean(age), condition = unique(condition), group = unique(group)) %>%
  ungroup()

## Find median age for case and control
df.medians = df %>%
  group_by(group) %>%
  summarise(`Group Median Age` = median(age))

## Make density plot
p = ggplot(df, aes(x = age, fill = group)) + geom_density(alpha = .5) +
  geom_vline(aes(xintercept = `Group Median Age`, color = group), data = df.medians) +
  xlab('Age') + ylab('Density') + ggtitle('Age Distributions') +
  theme_bw() +
  theme(axis.ticks.y = element_blank(), 
        axis.text.y = element_blank(),
        axis.title.y = element_text(size = 15),
        axis.text.x = element_text(size = 15),
        axis.title.x = element_text(size = 15),
        title = element_text(size = 15))
ggsave(SUPPLEMENTAL.FIGURE.1a.OUT.PATH, p, device = 'pdf', height = 4, width = 6)

# Supplemental Figure 1b -- barplots of ages by condition

## Load metadata
load(METADATA.IN.PATH)

## Get metadata for patients in training set
## Get ages for patients in training set
df = monogenic.all.assays %>%
  select(visit_id, patient_id, patient_age_at_time_of_blood_draw, condition, analysis_group) %>%
  mutate(patient_id = paste0('P', patient_id)) %>%
  mutate(age = patient_age_at_time_of_blood_draw) %>%
  mutate(group = ifelse(condition == "Healthy", "Healthy", "Disease")) %>%
  mutate(group = factor(group, levels = c("Healthy", "Disease"))) %>%
  filter(analysis_group == 'Discovery') %>%
  select(-patient_age_at_time_of_blood_draw, -analysis_group)

## Average ages over various visits, first by averaging ages from samples within a visit (should all be almost exactly the same),
## and then averaging ages from samples across visits
df = df %>%
  group_by(visit_id) %>%
  summarise(age = mean(age), patient_id = unique(patient_id), condition = unique(condition), group = unique(group)) %>%
  ungroup() %>%
  group_by(patient_id) %>%
  summarise(age = mean(age), condition = unique(condition), group = unique(group)) %>%
  ungroup()

## Get the median age associated with each condition
median.ages = df %>%
  group_by(condition) %>%
  summarise(median.age = median(age)) %>%
  ungroup()

## Append the median age of the condition to the data frame, and order by median age
df = df %>%
  right_join(median.ages, by = 'condition') %>%
  arrange(median.age) %>%
  mutate(condition = condition %>% as.character %>% abbrev_cond) %>%
  mutate(condition = factor(condition, levels = unique(condition)))

## Create boxplots
p = ggplot(df, aes(x = condition, y = age, fill = group)) + 
  geom_boxplot(outlier.shape = NA) + 
  xlab('Condition') + ylab('Age') + ggtitle('Condition-Specific Age Distributions') +
  theme_bw() +
  theme(axis.text.y = element_text(size = 15),
        axis.title.y = element_text(size = 15),
        axis.text.x = element_text(angle = 90, hjust = 1, size = 15, vjust = .4),
        axis.title.x = element_text(size = 15),
        title = element_text(size = 15))
ggsave(SUPPLEMENTAL.FIGURE.1b.OUT.PATH, p, device = 'pdf', height = 4, width = 6)

# Supplemental Figure 1c -- Gender split stacked barplot

## Load metadata
load(METADATA.IN.PATH)

## Get metadata for patients in training set
df = monogenic.all.assays %>%
  select(patient_id, gender, condition, analysis_group) %>%
  unique() %>%
  mutate(patient_id = paste0('P', patient_id)) %>%
  filter(analysis_group == 'Discovery')

## Get the total number of subjects of each gender and each condition
df = df %>%
  group_by(condition, gender) %>%
  summarise(gender.total = length(patient_id)) %>%
  ungroup()

## Get the total number of subjects of each condition
df.total = df %>%
  group_by(condition) %>%
  summarise(total = sum(gender.total)) %>%
  ungroup()

## Get the percent of subjects from each gender within a condition and sort by that fraction
df = df %>%
  right_join(df.total, by = 'condition') %>%
  mutate(percent = gender.total / total) %>%
  select(-gender.total, -total) %>%
  mutate(group = ifelse(condition == 'Healthy', 'Control', 'Case')) %>%
  mutate(percent.female = ifelse(gender == 'F', percent, 1 - percent)) %>%
  arrange(desc(group), desc(percent.female)) %>%
  mutate(condition = condition %>% as.character %>% abbrev_cond) %>%
  mutate(condition = factor(condition, levels = unique(condition)))

## Create the barplots
p = ggplot(df, aes(x = condition, y = percent, fill = gender)) + geom_bar(stat = 'identity') + 
  theme_bw() +
  xlab('Condition') + ylab('Percent') + ggtitle('Gender Split by Condition') +
  scale_fill_viridis_d() +
  theme(axis.text.y = element_text(size = 15),
        axis.title.y = element_text(size = 15),
        axis.text.x = element_text(angle = 90, hjust = 1, size = 15, vjust = .4),
        axis.title.x = element_text(size = 15),
        title = element_text(size = 15))
ggsave(SUPPLEMENTAL.FIGURE.1c.OUT.PATH, p, device = 'pdf', height = 4, width = 6)

# Supplemental Figure 1d -- violin plots of medication-specific effects

## Extract protein and gene variance parititon results (with condition and medication covariates included)
microarray.vp = readRDS(MICROARRAY.FEATURE.VP.IN.PATH)
somalogic.vp = readRDS(SOMALOGIC.FEATURE.VP.IN.PATH)

## Extract the medication names
medications = setdiff(colnames(microarray.vp), c('Patient','Condition', 'Residuals'))

## Insantiate a function to summarize the variance partition into a dataframe
summarize_vp = function(results) {
  df = data.frame(results)
  df = melt(df)
  df$variable = factor(df$variable, levels = c('Patient','Condition', medications, 'Residuals'))
  return(df)
}

## Insantiate a function to make the violin plot from the extracted data frame
violin_plot = function(df, colors) {
  ggplot(df, aes(x = variable, y = value, fill = variable)) + theme_bw() + 
    geom_violin(scale = "width", position = position_dodge(.8), width = .7, show.legend = FALSE) +
    scale_fill_manual(values = colors) + ylab('Variance Explained') + xlab('Covariate')
}

## Extract the variance parititon results for the genes and proteins into separate data frames
df.microarray = summarize_vp(microarray.vp)
df.somalogic = summarize_vp(somalogic.vp)

## Make the plot for the proteins
colors = c('violetred','plum1', rep('thistle1', length(medications)),'grey')
p1 = violin_plot(df.somalogic, colors) + ggtitle('Proteomic Features') +
  theme(
    axis.text.x = element_text(size = 10, angle = 30, hjust = 1),
    axis.title.x = element_text(size = 10),
    axis.text.y = element_text(size = 10),
    axis.title.y = element_text(size = 10))

## Make the plot for the genes
colors = c('royalblue4','royalblue', rep('lightblue', length(medications)),'grey')
p2 = violin_plot(df.microarray, colors) + ggtitle('Transcriptomic Features') +
  theme(
    axis.text.x = element_text(size = 10, angle = 30, hjust = 1),
    axis.title.x = element_text(size = 10),
    axis.text.y = element_blank(),
    axis.title.y = element_blank(),
    axis.ticks.y = element_blank())

p = plot_grid(p1, p2, align = "h", ncol = 2, rel_widths = c(10,9))
ggsave(SUPPLEMENTAL.FIGURE.1d.OUT.PATH, p, device = 'pdf', height = 3, width = 7)

# Supplemental Figure 1e - Simple variance partition module and tbnk effects

## Load the data
microarray.vp = readRDS(MICROARRAY.MODULES.SIMPLE.VP.IN.PATH)
somalogic.vp = readRDS(SOMALOGIC.MODULES.SIMPLE.VP.IN.PATH)
tbnks.vp = readRDS(TBNKS.SIMPLE.VP.IN.PATH)

## Instantiate a function to summarize each variance partition into a data frame
extract_results = function(results) {
  df = data.frame(results)
  df = df %>% 
    select(-Residuals) %>%
    tibble::rownames_to_column(var = 'module') %>%
    arrange(Patient) %>%
    mutate(module = factor(module, levels = unique(module)))
  return(df)
}

## Instantiate a function to create the barplot from the extracted results
bar_plot = function(df, color) {
  p = ggplot(df, aes(x = module, y = Patient)) +
    geom_bar(stat = 'identity', fill = color, show.legend = TRUE) +
    theme_bw() + ylim(0,1) + coord_flip() + 
    geom_hline(yintercept = .5, linetype = 'dashed', color = 'black') +
    ylab('Percent variation explained by Patient')
}

## Panel 1 -- TBNKs
### Rename the tbnk features names to make them clearer and more concside
df.tbnks = extract_results(tbnks.vp)
levels(df.tbnks$module) = levels(df.tbnks$module) %>% replace_tbnk_names()

df.tbnks <- df.tbnks %>%
        mutate(category = tbnk_groups(module, "new name"))

### Create the tbnk barplot
p.tbnks = bar_plot(df.tbnks, color = 'seagreen3') + 
  xlab('') +
  facet_grid(category~1, space = "free", scales = "free_y") +
  ylab('Variance Explained') + 
  theme(
    axis.text.x = element_text(size = 15),
    axis.text.y = element_text(size = 15),
    axis.title.x = element_blank(),
    axis.title.y = element_text(size = 15),
    strip.text.y = element_text(size = 15),
    legend.text = element_text(size = 15),
    legend.title = element_text(size = 15),
    strip.text.x = element_blank(),
    strip.background.x = element_blank()
  )

## Panel 2
df.somalogic = extract_results(somalogic.vp)

levels(df.somalogic$module) <- replace_mod_names_single_type(levels(df.somalogic$module), sheet = "PM")

### Create the somalogic barplot
p.somalogic = bar_plot(df.somalogic, color = 'violetred') + 
  xlab('') + 
  facet_grid("PM" ~ 1) +
  theme(
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_text(size = 15),
    axis.title.x = element_blank(),
    axis.title.y = element_text(size = 15),
    strip.text.y = element_text(size = 15),
    legend.text = element_text(size = 15),
    legend.title = element_text(size = 15),
    strip.text.x = element_blank(),
    strip.background.x = element_blank()
  )

## Panel 3
df.microarray = extract_results(microarray.vp)
levels(df.microarray$module) <- replace_mod_names_single_type(levels(df.microarray$module), sheet = "TM")

### Create the microarray bar plot
p.microarray = bar_plot(df.microarray, color = 'royalblue4') + 
  xlab('') + 
  ylab('') + 
  facet_grid("TM" ~ 1) +
  theme(
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_text(size = 15),
    axis.title.x = element_text(size = 15),
    axis.title.y = element_text(size = 15),
    strip.text.y = element_text(size = 15),
    legend.text = element_text(size = 15),
    legend.title = element_text(size = 15),
    strip.text.x = element_blank(),
    strip.background.x = element_blank()
  )

## Put the panels together
p = plot_grid(p.microarray, p.somalogic, p.tbnks, 
              align = "hv", 
              axis = "tblr",
              nrow = 3, 
              rel_heights = c(nrow(df.microarray), nrow(df.somalogic) + 2, nrow(df.tbnks) + 7))
ggsave(SUPPLEMENTAL.FIGURE.1e.OUT.PATH, p, device = 'pdf', height = 12, width = 7)


# Supplemental Figure 1f - Feature percentiles for protein and gene
microarray.vp = readRDS(MICROARRAY.FEATURES.SIMPLE.VP.IN.PATH)
somalogic.vp = readRDS(SOMALOGIC.FEATURES.SIMPLE.VP.IN.PATH)

## Create the dataframes listing what percentile each feature is for variance explained by patient
## and what the corresponding variance explained is
df.microarray = microarray.vp %>%
  data.frame() %>%
  tibble::rownames_to_column(var = 'feature') %>%
  select(-Residuals) %>%
  mutate(data.type = 'WB Transcriptome') %>%
  arrange(Patient) %>%
  tibble::rowid_to_column(var = 'percentile') %>%
  mutate(percentile = percentile / nrow(microarray.vp))

df.somalogic = somalogic.vp %>%
  data.frame() %>%
  tibble::rownames_to_column(var = 'feature') %>%
  select(-Residuals) %>%
  mutate(data.type = 'Serum Proteins') %>%
  arrange(Patient) %>%
  tibble::rowid_to_column(var = 'percentile') %>%
  mutate(percentile = percentile / nrow(somalogic.vp))

## Put together the gene and protein data frames
df = rbind(df.microarray, df.somalogic)

## Create the percentile plots
p = ggplot(df, aes(x = percentile, y = Patient)) + 
  geom_bar(stat = 'identity', color = 'cyan', fill = 'cyan') + 
  facet_wrap(~ data.type, scales = 'free_x') +
  theme_bw() +
  labs(x = 'Percentile', y = 'Variance Explained')

ggsave(SUPPLEMENTAL.FIGURE.1f.OUT.PATH, p, device = 'pdf', height = 2.5, width = 5)

R Snakemake ggplot2 dplyr tidyr cowplot reshape2 Biobase From line 2 of Supplemental_Figure_1/supplemental_figure_1_version_3.R

library(tidyverse)
library(gridExtra)


#setwd("/hpcdata/sg/sg_data/PROJECTS/Monogenic_Project")

TBNK.PATH <- snakemake@input[["tbnk"]]#"Data/TBNK/data_analysis_ready/tbnk_training_subject_level_eset.rds"
SOMA.PATH <- snakemake@input[["soma"]]#"Data/Somalogic/analysis_output/wgcna_results/scores_subject_level.rds"
ARRAY.PATH <- snakemake@input[["array"]]#"Data/Microarray/analysis_output/WGCNA/array_subject_scores.rds"

FIG.OUT.PATH <- snakemake@output[[1]]#"Paper_1_Figures/Supplemental_Figure_1/tbnk_cor_w_modules2.pdf"

tbnk.eset <- readRDS(TBNK.PATH)
soma.modules <- readRDS(SOMA.PATH)
array.modules <- readRDS(ARRAY.PATH)

#rename modules
source("scripts/util/Plotting/tbnk_featurename_replace.R")

#Put expressionsets into list and make sure that the patient_id are sampleNames/rownames of the expression matrix

eset.list <- list(protein = soma.modules, gene = array.modules, tbnk = tbnk.eset)
eset.list <- lapply(eset.list, function(eset){
  sampleNames(eset) <- eset[["patient_id"]]
  eset
})
featureNames(eset.list$protein) <- replace_mod_names_single_type(featureNames(eset.list$protein), "PM")

featureNames(eset.list$gene) <- replace_mod_names_single_type(featureNames(eset.list$gene), "TM")


tbnk_mat <- exprs(eset.list$tbnk) %>% t()

do_cortest <- function(x, y, method){
  intersection <- intersect(names(x), names(y))

  x <- x[match(intersection, names(x))]
  y <- y[match(intersection, names(y))]

  stopifnot(all.equal(names(x), names(y)))

  cor.test(x, y, method = method)
}


get_cor_dat<- function(eset, mat2, method){
  intersection <- intersect(rownames(mat2), eset$patient_id)
  mat <- exprs(eset)
  mat <- mat[ ,match(intersection, eset$patient_id)]
  mat <- mat[complete.cases(mat),]
  mat <- t(mat)

  mat2 <- mat2[match(intersection, rownames(mat2)),]
  stopifnot(all.equal(rownames(mat), rownames(mat2)))

  lapply(colnames(mat2), function(feature_x){
    lapply(colnames(mat), function(feature_y){
      x <- mat2[, feature_x]
      y <- mat[, feature_y]
      result <- do_cortest(x, y, method = method)
      data.frame(cor = result$estimate, p = result$p.value, feature_x = feature_x, feature_y = feature_y)
    }) %>% bind_rows()
  }) %>% bind_rows()
}

cordat.list <- lapply(eset.list[c(1,2)], get_cor_dat, tbnk_mat, method = "spearman")

reorder_levels <- function(dat){
  cormat <- dat %>% 
          select(cor, feature_x, feature_y) %>%
          spread(key = feature_y, value = cor) %>%
          `rownames<-`(.$feature_x) %>%
          select(-feature_x) %>%
          as.matrix()

  hc_row <- cormat %>%
          dist() %>% hclust()
  dat$feature_x <- factor(dat$feature_x, levels = hc_row$labels[hc_row$order])

  hc_col <- cormat %>%
          t() %>%
          dist() %>% hclust()
  dat$feature_y <- factor(dat$feature_y, levels = hc_col$labels[hc_col$order])

  dat
}

cordat.list <- lapply(cordat.list, reorder_levels)

cordat.list <- lapply(cordat.list, function(dat){
  levels(dat$feature_x) <- replace_tbnk_names(levels(dat$feature_x))
  dat$feat_group <- tbnk_groups(dat$feature_x, "new name")

  dat
})

add_signif <- function(dat, method = "fdr", cutoff = .05){
  dat <- dat %>%
          mutate(p.adj = p.adjust(p, method = "fdr")) %>%
          mutate(asterisk = ifelse(p.adj < cutoff, "*", ""))
  dat
}

cordat.list <- lapply(cordat.list, add_signif)

pdf(FIG.OUT.PATH, height = 6, width = 4)
p <- ggplot(cordat.list[[1]] , aes(x = feature_y, y = feature_x)) + geom_tile(aes(fill = cor)) +
  #scale_radius(limits = c(0,1)) + 
  scale_fill_gradient2(low = "blue", mid = "white", 
                        high = "red", limits = c(-1, 1)) + 
  geom_text(aes(label = asterisk), color = "black") +
  facet_grid(feat_group~"PM", scales = "free_y", space = "free") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
        axis.line = element_blank()) +
        #strip.text.x = element_blank(), 
        #strip.background.x = element_blank()) +
  xlab("") + ylab("")
print(p)

p <- ggplot(cordat.list[[2]] , aes(x = feature_y, y = feature_x)) + geom_tile(aes(fill = cor)) +
  #scale_radius(limits = c(0,1)) + 
  scale_fill_gradient2(low = "blue", mid = "white", 
                        high = "red", limits = c(-1, 1)) + 
  geom_text(aes(label = asterisk), color = "black") +
  theme_bw() +
  facet_grid(feat_group~"TM", scales = "free_y", space = "free") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
        axis.line = element_blank()) + 
        #strip.text.x = element_blank(), 
        #strip.background.x = element_blank()) +
  xlab("") + ylab("")

print(p)
dev.off()

R ggplot2 tidyverse gridExtra From line 1 of Supplemental_Figure_1/tbnk_module_cor.R

library(ggplot2)
library(gridExtra)
library(ggrepel)
library(ggpubr)
library(dplyr)
library(tidyr)
library(reshape2)

#setwd("/hpcdata/sg/sg_data/PROJECTS/Monogenic_Project")

# Source utilities
source('scripts/util/Plotting/plot_auc.R')
source('scripts/util/paper/abbrev_cond.R')

RF.META.IN.PATHS = list(
  CGD = snakemake@input[["cgd_meta"]],#'Classification/cgd_random_forest_sample_meta_data_all.RDS',
  STAT1.GOF = snakemake@input[["stat1_meta"]],#'Classification/stat1_random_forest_sample_meta_data_all.RDS',
  FMF = snakemake@input[["fmf_meta"]],#'Classification/fmf_random_forest_sample_meta_data_all.RDS',
  Job = snakemake@input[["job_meta"]]#'Classification/job_random_forest_sample_meta_data_all.RDS'
)


## The LOO CV results for each patient for each condition-based random forest classifier
HI.CONDITION.IN.PATHS = list(
  CGD = snakemake@input[["cgd_res"]],#'Classification/results/cgd_rf_results_all.RDS',
  STAT1.GOF = snakemake@input[["stat1_res"]],#'Classification/results/stat1_rf_results_all.RDS',
  FMF = snakemake@input[["fmf_res"]],#'Classification/results/fmf_rf_results_all.RDS',
  Job = snakemake@input[["job_res"]]#'Classification/results/job_rf_results_all.RDS'
)


## The condition-based random forest classifier results for the pvalue of the GVI of each feature via permutation testing
HI.CONDITION.PVALS.IN.PATH = list(
  CGD = snakemake@input[["cgd_pvals"]],#'Classification/results/cgd_rf_pvals_all.RDS',
  STAT1.GOF = snakemake@input[["stat1_pvals"]],#'Classification/results/stat1_rf_pvals_all.RDS',
  FMF = snakemake@input[["fmf_pvals"]],#'Classification/results/fmf_rf_pvals_all.RDS',
  Job = snakemake@input[["job_pvals"]]#'Classification/results/job_rf_pvals_all.RDS'
)


## The condition-based random forest classifier results for the pvalue of the GVI of each feature via permutation testing
HI.CONDITION.GVIS.IN.PATH = list(
  CGD = snakemake@input[["cgd_gvis"]],#'Classification/results/cgd_rf_gvis_all.RDS',
  STAT1.GOF = snakemake@input[["stat1_gvis"]],#'Classification/results/stat1_rf_gvis_all.RDS',
  FMF = snakemake@input[["fmf_gvis"]],#'Classification/results/fmf_rf_gvis_all.RDS',
  Job = snakemake@input[["job_gvis"]]#'Classification/results/job_rf_gvis_all.RDS'
)

AUC.FIG.OUT.PATH <- snakemake@output[["auc_fig"]]#"Paper_1_Figures/Supplemental_Figure_2/condition_classifier_auc.pdf"
PVAL.FIG.OUT.PATH <- snakemake@output[["pval_fig"]]

# Supplemental Figure 4f -- condition-specific classifiers
results = lapply(HI.CONDITION.IN.PATHS, readRDS) ## Extract the condition-specific classifier results 

gvis = lapply(HI.CONDITION.GVIS.IN.PATH, readRDS) 

metas = lapply(RF.META.IN.PATHS, readRDS) ## Extact the meta data assocaited with each condition-specific classifier

## List the condition groups for each classifier
condition.groups = list(CGD = c('XCGD', '47CGD'),
                        STAT1.GOF = 'STAT1 GOF',
                        FMF = 'FMF',
                        Job = 'Job')

## Create a name conversion map to make the data types underlying each classifier more clear
conversion = c("microarray.modules" = 'Gene modules', 
               "tbnks" = 'CBCs + Lymphocyte Phenotyping',
               "cbcs" = 'CBCs',
               "somalogic.modules" = 'Protein modules', 
               "all.modules.with.tbnks" = 'Modules + CBCs', 
               "all.modules.plus.grey.with.tbnks" = 'Modules + CBCs + Grey Proteins')

## Insantiate a function to get the AUC associated with each classifier and each condition
get_aucs = function(result, meta, condition.group) {
  ## Get the condition associated with each patient
  conditions = meta[rownames(result), 'condition']
  apply(result, 2, function(x) {
    ## Get the ROC curve associated with each classifier
    roc = get_roc(x = x, y = conditions, pos = condition.group)
    ## Get the AUC of that ROC curve
    get_auc(roc)
  })
}

## Run the function on each of the condition-specific classifier results (and simplify into a matrix)
aucs = mapply(get_aucs, results, metas, condition.groups, SIMPLIFY = T)

## Create a data frame holding the AUCs for each classifier, and melt it
df = as.data.frame(aucs) %>% 
  tibble::rownames_to_column(var = 'classifier') %>%
  mutate(classifier = conversion[classifier]) %>%
  mutate(classifier = factor(classifier, levels = conversion)) %>%
  melt()

## Create grouped barplots for each classifier and each condition
p = df %>% filter(classifier =="Modules + CBCs + Grey Proteins") %>%
        ggplot(aes(x = variable, y = value)) + 
  geom_bar(stat = 'identity') +
  theme_bw() + labs(fill = 'Classifier') + xlab('Condition') + ylab('AUC')

ggsave(AUC.FIG.OUT.PATH, p, device = 'pdf', height = 2, width = 3)


gvis_dat <- lapply(gvis, `[[`, "all.modules.plus.grey.with.tbnks" ) %>%
        lapply(tibble::enframe, name = "feature", value = "gvi") %>%
        bind_rows(.id = "condition") %>%
        mutate(feature = factor(feature), condition = factor(condition))

# Supplemental Figure 4g -- heatmap of gvis
gvi.pvals = lapply(HI.CONDITION.PVALS.IN.PATH, readRDS) ## Get the GVI pvalues associated with each classifier and each condition
pvals = sapply(gvi.pvals, function(x) {x$all.modules.plus.grey.with.tbnks}) ## Extract the pvalues for the features in the classifier with all data types
pvals = as.data.frame(pvals) ## Orangize this matrix into a data frame

## Get the top 5 features from each condition's classifier
top_features = lapply(colnames(pvals), function(group) {
  x = rownames(pvals)[order(pvals[[group]], decreasing = FALSE)]
  x = x[1:5]
})
top_features = unique(unlist(top_features))

## Adjust the pvalues using BH correction within each classifer, and get the negative log 10 adjusted pvalues
pvals = apply(pvals, 2, function(x) {
  x = p.adjust(x, 'fdr')
  x = -log10(x)
})

## Subject to just the top features
pvals = pvals[top_features, ]

## Create an index to associate each feature and each condition with a row and column
n = nrow(pvals)
m = ncol(pvals)
xs = t(matrix(1:m, nrow = m, ncol = n))
ys = matrix(1:n, nrow = m, ncol = n)


## Put the pvalue results, x-indexes, and y-indexes into a data frame
df = data.frame(x = xs[1:(n*m)], y = ys[1:(n*m)], NLP = pvals[1:(n*m)])
df$x = factor(df$x)
levels(df$x) = colnames(pvals)
df$y = factor(df$y)
levels(df$y) = rownames(pvals)

df <- df %>% rename(condition = x, feature = y)

df <- left_join(df, gvis_dat)

df <- df %>%
        mutate(condition = factor(condition, levels = colnames(pvals))) %>%
        mutate(feature = factor(feature, levels = rownames(pvals)))

levels(df$feature) <- levels(df$feature) %>%
        gsub(pattern = "somalogic\\.grey\\.", replacement = "", .) %>%
        gsub(pattern = "microarray\\.modules\\.red", replacement = "TM1 : Inteferon", .)

## And plot the associated heatmap using the ggplot tile function
p = ggplot(df, aes(x = condition, y = feature)) + 
        geom_point(aes(size = NLP, color = gvi)) +
        scale_color_viridis_c() +
        theme_bw() + 
  #xlab('Condition') + ylab('Feature') + 
  labs(size = '-log10(pvalue)') +
  theme(axis.text.x = element_text(angle = 30, vjust = 0.5, hjust=1))
ggsave(PVAL.FIG.OUT.PATH, p, device = 'pdf', height = 4, width = 5)

R ggplot2 dplyr tidyr ggrepel ggpubr reshape2 gridExtra From line 2 of Supplemental_Figure_2/supplemental_figure_2_other_classifiers.R

library(tidyverse)
library(pheatmap)
library(RColorBrewer)
library(ggfortify)
library(ggrepel)
library(Biobase)

#setwd("/hpcdata/sg/sg_data/PROJECTS/Monogenic_Project")

source("scripts/util/paper/abbrev_cond.R")
source("scripts/util/Plotting/tbnk_featurename_replace.R")

TBNK.ESET.IN.PATH <- snakemake@input[[1]]#"Data/TBNK/data_analysis_ready/tbnk_training_subject_level_eset.rds"
eset <- readRDS(TBNK.ESET.IN.PATH)

HEATMAP.OUT.PATH <- snakemake@output[["heatmap"]]#"Paper_1_Figures/Supplemental_Figure_2/tbnk_heatmap.pdf"
PCA.OUT.PATH <- snakemake@output[["pca"]]#"Paper_1_Figures/Supplemental_Figure_2/tbnk_pca.pdf"
#keep.features <- c(
#  "cd4_cd3_count","nk_cells_count","cd3_count","cd8_cd3_count",
#  "cd19_count","wbc","rbc",
#  "hemoglobin","mcv",
#  "mch","rdw",
#  "platelet_count","neutrophil_abs",
#  "lymphocytes_abs","monocytes_abs","eosinophil_abs",
#  "basophil_abs"
#)

mat <- exprs(eset[1:18, ]) %>% t()
pr_obj <- prcomp(mat, scale = TRUE, center = TRUE)


meta <- pData(eset) %>%
  mutate(cond.abbrev = abbrev_cond(condition)) %>%
  mutate(cond.grouped = group_cond(condition)) %>%
  mutate(PC1 = pr_obj$x[, "PC1"],
         PC2 = pr_obj$x[, "PC2"])

centroids <- meta %>% group_by(condition) %>%
        summarise(mean_pc1 = mean(PC1), mean_pc2 = mean(PC2),
        n_subj = n())

meta <- left_join(meta, centroids)

centroid_sub <- meta %>% 
        filter(n_subj > 3) %>%
        select(mean_pc1, mean_pc2, cond.grouped, cond.abbrev) %>% distinct()

pca.plot.points <- 
  ggplot(meta, aes(x = PC1, y = PC2, color = cond.grouped)) +
  geom_text(aes(label = cond.abbrev), size = 2) +
  geom_point(data = centroid_sub, aes(x=mean_pc1, y=mean_pc2),size=5)+
  geom_segment(aes(x=mean_pc1, y=mean_pc2, xend=PC1, yend=PC2), alpha = .2) +
  geom_text_repel(data = centroid_sub, aes(x=mean_pc1, y=mean_pc2, label = cond.abbrev),size=5)+
  theme_bw() +
  theme(legend.position = "none")

#pca.plot.age <- 
#  #ggplot(meta, aes(x = PC1, y = PC2, color = cond.abbrev)) +
#  ggplot(meta, aes(x = PC1, y = PC2)) +
#  geom_text(aes(label = cond.abbrev, color = Age), size = 2) +
#  #geom_point(aes(x=mean_pc1, y=mean_pc2),size=5)+
#  #geom_segment(aes(x=mean_pc1, y=mean_pc2, xend=PC1, yend=PC2), alpha = .2) +
#  #geom_text_repel(data = centroid_sub, aes(x=mean_pc1, y=mean_pc2, label = cond.abbrev),size=5)+
#  theme_bw()

#file.remove("Paper_1_Figures/Supplemental_Figure_2/tbnk_pca.pdf")
pdf(PCA.OUT.PATH, height = 5, width = 5.5)
print(pca.plot.points)
#print(pca.plot.age)
dev.off()

#Heatmap showing Z-score of CBC parameters

#The big blue group clustering together is Job; they have high eosinophils


condition <- abbrev_cond(eset$condition)
conditions <- table(condition)
large.conditions <- conditions[table(condition) > 10]
large.condition <- condition
large.condition[!large.condition %in% names(large.conditions)] <- "Other"
large.condition <- factor(large.condition)

#All subjects

#cond <- factor(eset$condition)
annotation <- data.frame(All_groups = condition, Major_groups = large.condition, age = eset$Age)
rownames(annotation) <- colnames(eset)

breaksList = seq(-3, 3, by = .01)
pdf(HEATMAP.OUT.PATH, height = 9, width = 14)
pheatmap(exprs(eset) %>% t %>% scale %>% t %>% `rownames<-`(replace_tbnk_names(rownames(.))),
         color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdYlBu")))(length(breaksList)),
         breaks = breaksList,
         show_colnames = FALSE, 
         annotation_col = annotation)
dev.off()

R ggplot2 tidyverse ggrepel RColorBrewer pheatmap Biobase ggfortify From line 1 of Supplemental_Figure_2/tbnk_heatmap.R

library(tidyverse)
library(cowplot)
library(ggraph)
library(tidygraph)
library(ggpubr)
library(ggrepel)

#setwd("/hpcdata/sg/sg_data/PROJECTS/Monogenic_Project")

JIVE.PC.IN.PATH <- snakemake@input[["jive_pcs"]]#"Integration_output/jive/subject/prcomp_list.rds"
JIVE.IN.PATH <- snakemake@input[["jive"]]#"Integration_output/jive/subject/jive.rds"

BOXPLOT.OUT.PATH <- snakemake@output[["boxplots"]]#"Paper_1_Figures/Supplemental_Figure_3/jive_array_indiv_boxplots.pdf"
SCATTER.OUT.PATH <- snakemake@output[["scatter"]]#"Paper_1_Figures/Supplemental_Figure_3/jive_array_indiv_scatter_w_centroids.pdf"
source("scripts/util/paper/abbrev_cond.R")

prcomp.list <- readRDS(JIVE.PC.IN.PATH)
jive <- readRDS(JIVE.IN.PATH)
pdat <- jive$pdat
array_indiv <- prcomp.list$array.ind$x

stopifnot(identical(rownames(array_indiv), pdat$patient_id))

array_indiv <- array_indiv %>% 
  as.data.frame() %>%
  bind_cols(pdat) %>% 
  mutate(cond.abbrev = abbrev_cond(condition)) %>%
  mutate(cond.grouped = group_cond(condition))


#Inspired by this post
#https://stackoverflow.com/questions/23463324/r-add-centroids-to-scatter-plot

array_indiv_centroids <- array_indiv %>% group_by(condition) %>%
        summarise(mean_pc1 = mean(PC1), mean_pc2 = mean(PC2),n_subj = n())

array_indiv <- left_join(array_indiv, array_indiv_centroids)

array_indiv_sub <- array_indiv %>% 
        filter(n_subj > 3) %>%
        select(mean_pc1, mean_pc2, cond.abbrev, cond.grouped) %>% distinct()

pca.plot.points <- 
  ggplot(array_indiv, aes(x = PC1, y = PC2, color = cond.grouped)) +
  #geom_point( size = 3) +
  geom_text(aes(label = cond.abbrev), size = 2) +
  geom_point(data = array_indiv_sub, aes(x=mean_pc1, y=mean_pc2),size=5)+
  geom_segment(aes(x=mean_pc1, y=mean_pc2, xend=PC1, yend=PC2), alpha = .2) +
  #geom_text_repel(data = array_indiv_sub, aes(x=mean_pc1, y=mean_pc2, label = cond.abbrev),size=5, color = "black")+
  geom_text_repel(data = array_indiv_sub, aes(x=mean_pc1, y=mean_pc2, label = cond.abbrev),size=5)+
  theme_bw() +
  theme(legend.position = "none") #+
  #theme(axis.title.y = element_blank(), axis.title.x = element_blank()) +
  #theme(plot.margin = unit(c(0, 0, 0, 0), "cm"))

pdf(SCATTER.OUT.PATH, height = 5, width = 6)
print(pca.plot.points)
dev.off()

pc.medians <-
  array_indiv %>%
  group_by(cond.abbrev) %>%
  summarise(pc1.median = median(PC1), pc2.median = median(PC2))


pc1.order <- pc.medians$cond.abbrev[order(pc.medians$pc1.median)]
pc1.order <- c("Healthy", pc1.order[pc1.order != "Healthy"])
array_indiv$cond.abbrev <- factor(array_indiv$cond.abbrev, levels = pc1.order)

pc1.box <- 
  ggplot(array_indiv, aes(x = cond.abbrev, y = PC1)) +
  geom_boxplot(outlier.shape = NA, aes(fill = cond.grouped)) +
  ggbeeswarm::geom_beeswarm(size = .8, alpha = .4)+
  theme_bw() +
  stat_compare_means(ref.group = "Healthy", hide.ns = TRUE, label = "p.signif", color = "red") +
  coord_flip() + 
  geom_vline(xintercept = 1.5) +
  theme(legend.position = "none") +
  xlab("Condition") +
  ylab("iPC1")


pc2.order <- pc.medians$cond.abbrev[order(pc.medians$pc2.median)]
pc2.order <- c("Healthy", pc2.order[pc2.order != "Healthy"])
array_indiv$cond.abbrev <- factor(array_indiv$cond.abbrev, levels = pc2.order)

pc2.box <- 
  ggplot(array_indiv, aes(x = cond.abbrev, y = PC2)) +
  geom_boxplot(outlier.shape = NA, aes(fill = cond.grouped)) +
  ggbeeswarm::geom_beeswarm(size = .8, alpha = .4)+
  theme_bw() +
  #theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  coord_flip() + 
  stat_compare_means(ref.group = "Healthy", hide.ns = TRUE, label = "p.signif", color = "red") +
  theme(legend.position = "none") +
  geom_vline(xintercept = 1.5) +
  xlab("Condition") +
  ylab("iPC2")

pdf(BOXPLOT.OUT.PATH, height = 3, width = 4)
print(pc1.box)
print(pc2.box)
dev.off()

R ggplot2 tidyverse cowplot ggrepel ggpubr ggraph tidygraph From line 1 of Supplemental_Figure_3/array_individual_scatter_and_boxplots.R

library(ggplot2)
library(dplyr)
library(ggpubr)
library(ggbeeswarm)

if(!exists("snakemake")){
  setwd("../../..")
  source("scripts/util/paper/parse_snakemake.R")
  parse_snakemake(rule = "supplemental_figure_3_cgd_jpc1")
}

JIVE.PC.IN.PATH <- snakemake@input[["jive_pcs"]]
JIVE.IN.PATH <- snakemake@input[["jive"]]

FIG.OUT.PATH <- snakemake@output[[1]]

prcomp.list <- readRDS(JIVE.PC.IN.PATH)
jive <- readRDS(JIVE.IN.PATH)
pdat <- jive$pdat
joint <- prcomp.list$joint$x

stopifnot(identical(rownames(joint), pdat$patient_id))

joint <- joint %>% 
  as.data.frame() %>%
  bind_cols(pdat)

joint <- joint %>% filter(grepl("CGD", condition))

pdf(FIG.OUT.PATH, height = 3, width =3)
p <- ggplot(joint, aes(x = condition, y = PC1)) +
        geom_boxplot(outlier.shape = NA) +
        geom_beeswarm() +
        stat_compare_means() +
        ylab("jPC1") + xlab("")
print(p)


p <- ggplot(joint, aes(x = condition, y = PC1)) +
        geom_boxplot(outlier.shape = NA) +
        geom_beeswarm() +
        stat_compare_means(method = "t.test") +
        ylab("jPC1") + xlab("")
print(p)
dev.off()

R Snakemake ggplot2 dplyr ggpubr ggbeeswarm From line 1 of Supplemental_Figure_3/cgd_jpc1.R

suppressPackageStartupMessages({
  library(ggplot2)
  library(pheatmap)
  library(tidyverse)
  library(ggpubr)
})


#setwd("/hpcdata/sg/sg_data/PROJECTS/Monogenic_Project")

source("scripts/util/Processing/averageRepeatSamples.R")
source("scripts/util/paper/abbrev_cond.R")

if(!exists("snakemake")){
  setwd("../../..")
  source("scripts/util/paper/parse_snakemake.R")
  parse_snakemake(rule = "supplemental_figure_3_pc2_leuko_composite")
}

JIVE.PATH <- snakemake@input[["jive"]]#"Integration_output/jive/subject/jive.rds"
JIVE.PC.PATH <- snakemake@input[["jive_pcs"]]#"Integration_output/jive/subject/prcomp_list.rds"
TBNK.PATH <- snakemake@input[["tbnk"]]#"Data/TBNK/data_analysis_ready/tbnk_training_subject_level_eset.rds"

PLOT_OUT_PATH <- snakemake@output[[1]]#"Paper_1_Figures/Supplemental_Figure_3/leuko_composite_pc2_cor.pdf"
MARROW_PLOT_OUT_PATH <- snakemake@output[[2]]#"Paper_1_Figures/Supplemental_Figure_3/leuko_composite_pc2_cor.pdf"
PLOT_SEPARATE_OUT_PATH <- snakemake@output[[3]]#"Paper_1_Figures/Supplemental_Figure_3/leuko_composite_separate_pc2_cor.pdf"

tbnk.eset <- readRDS(TBNK.PATH)

prcomp.list <- readRDS(JIVE.PC.PATH)
joint <- prcomp.list$joint$x

jive <- readRDS(JIVE.PATH)
pdat <- jive$pdat

intersection <- intersect(rownames(joint), tbnk.eset$patient_id)
tbnk.mat <- exprs(tbnk.eset)
tbnk.mat <- tbnk.mat[,match(intersection, tbnk.eset$patient_id)]
tbnk.mat <- t(tbnk.mat)

joint <- joint[match(intersection, rownames(joint)),]

stopifnot(all.equal(rownames(tbnk.mat), rownames(joint)))
pdat <- pdat %>% filter(patient_id %in% rownames(joint))

#Select lymphocytes, monocytes, neutrophils absolute counts

keep.cells <- c("neutrophil_abs", "monocytes_abs", "lymphocytes_abs")
tbnk.mat <- tbnk.mat[, keep.cells]

# Make everything z score for healthy mean and sd

healthy.means <- apply(tbnk.mat[pdat$condition == "Healthy",], 2, mean)
healthy.sd <- apply(tbnk.mat[pdat$condition == "Healthy",], 2, sd)

tbnk.z <- tbnk.mat
for(i in seq_len(ncol(tbnk.z))){
  tbnk.z[, i] <- (tbnk.z[, i] - healthy.means[[i]]) / healthy.means[[i]]
}

#Create composite score

#Average of the Z-scores
tbnk.composite <- apply(tbnk.z, 1, mean)

dat <- pdat %>%
  mutate(composite.score = tbnk.composite) %>%
  mutate(PC2 = joint[, "PC2"]) %>%
  mutate(cond.grouped = group_cond(condition)) %>%
  mutate(cond.abbrev = abbrev_cond(condition))

#dat %>% 
#  group_by(condition) %>%
#  summarise(pc2.med = median(PC2)) %>% 
#  arrange(pc2.med)

#These are the conditions that will be included in the scatter of PC2 vs composite score when show each condition in facets


conditions.of.interest <- c("Healthy", "DADA2", "GATA2", "CTLA4", "PGM3", "PI3K", "TERC", "TERT")


#Add column that can be used to select conditions of interest and add annotation that groups the Terts and Tercs


dat <- 
  dat %>%
  mutate(condition2 = replace(condition, which(!condition %in% conditions.of.interest), "other")) %>% 
  mutate(condition2 = replace(condition2, condition %in% c("TERT", "TERC"), "TERT/TERC"))


#Plot across everyone

pdf(PLOT_OUT_PATH, height =5, width = 5)
ggplot(dat, aes(x = composite.score, y = PC2)) + 
  geom_text(aes(color = cond.abbrev, label = cond.abbrev), size = 2) + 
  ylab("jPC2") +
  stat_cor(method = "spearman") +
  theme_bw() +
  theme(legend.position = "none")
dev.off()


#2nd plot. just gata2
#patients with marrow issues

#From Rachel in Teams 2021-08-06 4:41 PM. Personal chat
#| P129 | normal marrow |
#| P164 | normal marrow |
#| P182 | normal marrow |
#| P150 | normal marrow |
#| P97  | mild G2BMD    |
#| P101 | mild G2BMD    |
#| P168 | MDS           |
#| P86  | MDS           |
#| P166 | MDS           |

normal_pats <- c("P129", "P164", "P182", "P150")
mild_pats <- c("P97", "P101")
mds_pats <- c("P168", "P86", "P166")

gata2_dat <- dat %>% 
        filter(condition == "GATA2") %>%
        mutate(marrow_status = NA) %>%
        mutate(marrow_status = replace(marrow_status, patient_id %in% normal_pats, "normal")) %>%
        mutate(marrow_status = replace(marrow_status, patient_id %in% mild_pats, "mild G2BMD")) %>%
        mutate(marrow_status = replace(marrow_status, patient_id %in% mds_pats, "MDS")) %>%
        mutate(marrow_status = factor(marrow_status, levels = c("normal", "mild G2BMD", "MDS")))

p <- ggplot(gata2_dat, aes(x = composite.score, y = PC2)) +
        geom_point(aes(shape = marrow_status, color = marrow_status)) +
        scale_color_manual(values = c("black", "orange", "red")) +
        labs(color = "Marrow Status", shape = "Marrow Status") +
        theme_bw() +
        facet_wrap(~"GATA2")

ggsave(plot = p, filename = MARROW_PLOT_OUT_PATH, height = 2, width = 3.5)

#Plot by condition with p values
p <- dat %>%
  filter(condition %in% conditions.of.interest) %>%
  ggplot(aes(x = composite.score, y = PC2)) + 
  geom_point(aes(color = cond.abbrev)) + 
  ylab("jPC2") +
  stat_cor(method = "spearman") + 
  facet_wrap(~condition2, nrow = 4) + 
  theme_bw() +
  theme(legend.position = "none")
ggsave(plot = p, filename = PLOT_SEPARATE_OUT_PATH, width = 4)

R Snakemake ggplot2 tidyverse ggpubr From line 1 of Supplemental_Figure_3/cor_PC2_leuko_composite.R

library(tidyverse)

#setwd("/hpcdata/sg/sg_data/PROJECTS/Monogenic_Project")
DAT.IN.PATH <- snakemake@input[[1]]#"Integration_output/jive/subject/pc_enrich_dat_camera.rds"

FIG.OUT.PATH <- snakemake@output[[1]]#"Paper_1_Figures/Supplemental_Figure_3/jive_pc_enrichments_joint_down_pc1.pdf"

all.dat <- readRDS(DAT.IN.PATH)
all.dat <- all.dat %>% filter(geneset.db != "tiss.general")
all.dat$col <- as.numeric(as.factor(all.dat$geneset.db))

plot_single <- function(dat, 
                        n.genesets = 20,
                        main = ""
                        ){
  #filter to the top ranked by FDR for that comparison
  selection2 <- order(dat[["FDR_all_db"]])[seq_len(n.genesets)]
  dat <- dat[rev(selection2), ]

  dat <- dat %>%
          mutate(geneset = paste(geneset.db, geneset, sep = "_"))

  #geneset_vec <- strsplit(dat$geneset, split = "_")

  #nwords_in_geneset_name <- sapply(geneset_vec, length)

  #for(i in seq_along(geneset_vec)){
  #  #print(nwords_in_geneset_name[[i]])
  #  if(nwords_in_geneset_name[[i]] > 6){
  #        geneset_vec[[i]] <- append(geneset_vec[[i]], "\n", after = 6)
  #      }
  #  geneset_vec[[i]] <- paste(geneset_vec[[i]], collapse = "_")
  #}

  #dat$geneset <- geneset_vec

  #if(nwords_in_geneset_name)

  geneset_levels <- dat$geneset[order(dat[["FDR_all_db"]])]

  dat <- dat %>%
          mutate(geneset = factor(geneset, levels = geneset_levels))

  p <- ggplot(dat, aes(y = -log10(FDR_all_db), x = geneset)) + 
          geom_col() +
          theme_bw() +
          xlab("") +
          coord_flip() +
          ggtitle(main)
  return(p)
}


all.dat <- all.dat %>%
        group_by(pca.data, in.data, PC, Direction) %>%
        mutate(FDR_all_db = p.adjust(PValue, method = "fdr"))


plot_list <- list()
i <- 0
for(indata in c("array", "soma")){
  for(PC. in c("PC1")){
    for(direct in c("Down")){

      p <- all.dat %>%
              filter(pca.data == "joint", in.data == indata, 
                     PC == PC., Direction == direct) %>% 
              plot_single(main = paste("joint", indata, PC., direct), n.genesets  = 20)

      i <- i + 1
      plot_list[[i]] <- p

    }
  }
}
library(cowplot)
p_all <- plot_grid(plotlist = plot_list, ncol =2, nrow = 1)

pdf(FIG.OUT.PATH, 
    height = 3.5, width = 11)
print(p_all)

dev.off()

R ggplot2 tidyverse cowplot From line 1 of Supplemental_Figure_3/jive_pca_enrich.R

library(tidyverse)
library(cowplot)
library(ggraph)
library(tidygraph)
library(ggpubr)
library(ggrepel)

#setwd("/hpcdata/sg/sg_data/PROJECTS/Monogenic_Project")

JIVE.PC.IN.PATH <- snakemake@input[["jive_pcs"]]#"Integration_output/jive/subject/prcomp_list.rds"
JIVE.IN.PATH <- snakemake@input[["jive"]]#"Integration_output/jive/subject/jive.rds"

BOXPLOT.OUT.PATH <- snakemake@output[["boxplots"]]#"Paper_1_Figures/Supplemental_Figure_3/jive_soma_indiv_boxplots.pdf"
SCATTER.OUT.PATH <- snakemake@output[["scatter"]]#"Paper_1_Figures/Supplemental_Figure_3/jive_soma_indiv_scatter_w_centroids.pdf"
source("scripts/util/paper/abbrev_cond.R")

prcomp.list <- readRDS(JIVE.PC.IN.PATH)
jive <- readRDS(JIVE.IN.PATH)
pdat <- jive$pdat
soma_indiv <- prcomp.list$soma.ind$x

stopifnot(identical(rownames(soma_indiv), pdat$patient_id))

soma_indiv <- soma_indiv %>% 
  as.data.frame() %>%
  bind_cols(pdat) %>% 
  mutate(cond.abbrev = abbrev_cond(condition)) %>%
  mutate(cond.grouped = group_cond(condition))


#Inspired by this post
#https://stackoverflow.com/questions/23463324/r-add-centroids-to-scatter-plot

soma_indiv_centroids <- soma_indiv %>% group_by(condition) %>%
        summarise(mean_pc1 = mean(PC1), mean_pc2 = mean(PC2),n_subj = n())

soma_indiv <- left_join(soma_indiv, soma_indiv_centroids)

soma_indiv_sub <- soma_indiv %>% 
        filter(n_subj > 3) %>%
        select(mean_pc1, mean_pc2, cond.abbrev, cond.grouped) %>% distinct()

pca.plot.points <- 
  ggplot(soma_indiv, aes(x = PC1, y = PC2, color = cond.grouped)) +
  #geom_point( size = 3) +
  geom_text(aes(label = cond.abbrev), size = 2) +
  geom_point(data = soma_indiv_sub, aes(x=mean_pc1, y=mean_pc2),size=5)+
  geom_segment(aes(x=mean_pc1, y=mean_pc2, xend=PC1, yend=PC2), alpha = .2) +
  #geom_text_repel(data = soma_indiv_sub, aes(x=mean_pc1, y=mean_pc2, label = cond.abbrev),size=5, color = "black")+
  geom_text_repel(data = soma_indiv_sub, aes(x=mean_pc1, y=mean_pc2, label = cond.abbrev),size=5)+
  theme_bw() +
  theme(legend.position = "none") #+
  #theme(axis.title.y = element_blank(), axis.title.x = element_blank()) +
  #theme(plot.margin = unit(c(0, 0, 0, 0), "cm"))

pdf(SCATTER.OUT.PATH, height = 5, width = 6)
print(pca.plot.points)
dev.off()

pc.medians <-
  soma_indiv %>%
  group_by(cond.abbrev) %>%
  summarise(pc1.median = median(PC1), pc2.median = median(PC2))


pc1.order <- pc.medians$cond.abbrev[order(pc.medians$pc1.median)]
pc1.order <- c("Healthy", pc1.order[pc1.order != "Healthy"])
soma_indiv$cond.abbrev <- factor(soma_indiv$cond.abbrev, levels = pc1.order)

pc1.box <- 
  ggplot(soma_indiv, aes(x = cond.abbrev, y = PC1)) +
  geom_boxplot(outlier.shape = NA, aes(fill = cond.grouped)) +
  ggbeeswarm::geom_beeswarm(size = .8, alpha = .4)+
  theme_bw() +
  stat_compare_means(ref.group = "Healthy", hide.ns = TRUE, label = "p.signif", color = "red") +
  coord_flip() + 
  geom_vline(xintercept = 1.5) +
  theme(legend.position = "none") +
  xlab("Condition") +
  ylab("iPC1")


pc2.order <- pc.medians$cond.abbrev[order(pc.medians$pc2.median)]
pc2.order <- c("Healthy", pc2.order[pc2.order != "Healthy"])
soma_indiv$cond.abbrev <- factor(soma_indiv$cond.abbrev, levels = pc2.order)

pc2.box <- 
  ggplot(soma_indiv, aes(x = cond.abbrev, y = PC2)) +
  geom_boxplot(outlier.shape = NA, aes(fill = cond.grouped)) +
  ggbeeswarm::geom_beeswarm(size = .8, alpha = .4)+
  theme_bw() +
  #theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  coord_flip() + 
  stat_compare_means(ref.group = "Healthy", hide.ns = TRUE, label = "p.signif", color = "red") +
  theme(legend.position = "none") +
  geom_vline(xintercept = 1.5) +
  xlab("Condition") +
  ylab("iPC2")

pdf(BOXPLOT.OUT.PATH, height = 3, width = 4)
print(pc1.box)
print(pc2.box)
dev.off()

R ggplot2 tidyverse cowplot ggrepel ggpubr ggraph tidygraph From line 1 of Supplemental_Figure_3/soma_individual_scatter_and_boxplots.R

library(ggplot2)
library(dplyr)
library(tidyr)
source('scripts/util/paper/abbrev_cond.R')
source('scripts/util/Groups/groups.R')
source('scripts/util/Plotting/plot_auc.R')

# Set paths
## The healthy index using AI as a background
AI.BASED.HI.IN.PATH = snakemake@input[[1]] #'Classification/results/healthy_rf_results_AI.RDS'
## The healthy index feature GVI pvalues from permutation testing using AI as a background
AI.BASED.HI.PVALS.IN.PATH = snakemake@input[[2]] #'Classification/results/healthy_rf_pvals_AI.RDS'
## The AI and healthy sample meta data
AI.BASED.HI.META.IN.PATH = snakemake@input[[3]] #'Classification/healthy_random_forest_sample_meta_data_AI.RDS'
## The healthy index using PID as a background
PID.BASED.HI.IN.PATH = snakemake@input[[4]] #'Classification/results/healthy_rf_results_PID.RDS'
## The healthy index feature GVI pvalues from permutation testing using PID as a background
PID.BASED.HI.META.IN.PATH = snakemake@input[[5]] #'Classification/healthy_random_forest_sample_meta_data_PID.RDS'
## The PID and healthy sample meta data
PID.BASED.HI.PVALS.IN.PATH = snakemake@input[[6]] #'Classification/results/healthy_rf_pvals_PID.RDS'
## The healthy index using all subjects as a background
ALL.BASED.HI.IN.PATH = snakemake@input[[7]] #'Classification/results/healthy_rf_results_all.RDS'
## The healthy index feature GVI pvalues from permutation testing using all conditions as a background
ALL.BASED.HI.META.IN.PATH = snakemake@input[[8]] #'Classification/healthy_random_forest_sample_meta_data_all.RDS'
## The all conditiion and healthy sample meta data
ALL.BASED.HI.PVALS.IN.PATH = snakemake@input[[9]] #'Classification/results/healthy_rf_pvals_all.RDS'

## The PID predictions after training the classifier on just AI and Healthy
PID.PREDICTIONS.FROM.AI.IN.PATH = snakemake@input[[10]] #'Classification/predictions/healthy_rf_PID_predictions_using_AI_index.RDS'
## The AI predictions after training the classifier on just PID and Healthy
AI.PREDICTIONS.FROM.PID.IN.PATH = snakemake@input[[11]] #'Classification/predictions/healthy_rf_AI_predictions_using_PID_index.RDS'

## The figure out path
PDF.OUT.PATH = snakemake@output[[1]] #'Paper_1_Figures/Supplemental_Figure_4/figure_4_AI_and_PID_HI_addendum.pdf'

# Get group constituents
AI = util.get_ai()
PID = util.get_pid()
Telo = util.get_tert_terc()

# Instantiate a function for creating AUC plots
make_auc_plot = function(results, meta, title) {
  result = results$all.modules.plus.grey.with.tbnks
  roc = get_roc(result, meta$condition, 'Healthy')
  auc = get_auc(roc)

  auc = format(auc, digits = 2)

  p = ggplot(roc, aes(x = fpr, y = tpr)) + geom_line(color = 'black', show.legend = FALSE) + 
    geom_abline(slope = 1, linetype = 'dashed', color = 'grey') + 
    theme_bw() + geom_text(aes(x = .75, y = .25), size = 4, label = paste0('AUC: ', auc), show.legend = FALSE) + 
    xlab('False Positive Rate') + ylab('True Positive Rate') + 
    ggtitle(title) +
    theme(axis.title.x = element_text(size = 8),
          axis.title.y = element_text(size = 8),
          axis.text.x = element_text(size = 8),
          axis.text.y = element_text(size = 8))
}

# Instantiate a function for making the bar plots
make_bar_plots = function(results, meta, title) {
  df = data.frame(healthy.index = results$all.modules.plus.grey.with.tbnks, 
                  condition = meta$condition) %>%
    mutate(condition = as.character(condition)) %>%
    mutate(group = condition %>% # Add in condition super-type
             replace(condition %in% AI, 'AI') %>%
             replace(condition %in% PID, 'PID') %>%
             replace(condition %in% Telo, 'Telo')) %>%
    mutate(group = factor(group, levels = intersect(c('Healthy','AI','Telo','PID'), unique(group)))) %>%
    mutate(condition = abbrev_cond(condition)) # We use the abbreviated condition names

  ## Compute the median healthy index for each condition
  condition.median.healthy.indexes = df %>%
    group_by(condition) %>%
    summarise(condition.median.healthy.index = median(healthy.index))

  ## Add in the median healthy index for each condition to the original data frame
  df = df %>%
    right_join(condition.median.healthy.indexes, by = 'condition') %>%
    arrange(as.numeric(group), desc(condition.median.healthy.index)) %>% # Sort by condition super-type and then median healthy index
    mutate(condition = factor(condition, condition %>% unique)) %>%
    mutate(condition = relevel(condition, abbrev_cond('Healthy'))) %>% # Make sure Healthy is the first level
    mutate(condition = factor(condition, levels = rev(levels(condition))))

  ## Plot the bar plots
  HI_max = max(df$healthy.index) + .01
  p1 = ggplot(df, aes(x = condition, y = healthy.index, fill = group)) + 
    geom_boxplot(outlier.colour = NA) + 
    ylim(0, HI_max) +
    theme_bw() + geom_jitter() + coord_flip() + ggtitle(title) +
    theme(axis.text.x = element_text(size = 15),
          axis.title.x = element_blank(),
          axis.text.y = element_text(size = 15),
          axis.title.y = element_text(size = 15),
          legend.text = element_text(size = 15),
          legend.title = element_text(size = 15),
          legend.key.size = unit(2,"line"))
}

# Instantiate a function for plotting the top features from each classifier
make_pval_plots = function(p.vals, title) {
  p.vals = p.vals$all.modules.plus.grey.with.tbnks
  p.adjusted = p.adjust(p.vals, 'fdr')
  neg.log10.p.adjusted = -1 * log10(p.adjusted)

  ## Create a data frame with the feature names, data type, and negative log 10 pvalues, just for features passing the FDR cutoff
  df = data.frame(label = names(p.vals), p.adjusted = p.adjusted, neg.log10.pvals = neg.log10.p.adjusted) %>%
    filter(p.adjusted < .20) %>%
    select(-p.adjusted) %>%
    mutate(data.type = label %>%
             as.character() %>%
             replace(., grepl('somalogic\\.grey\\.', .), 'Grey\nModule\nProteins') %>%
             replace(., grepl('somalogic\\.modules\\.', .), 'Protein\nModule\nScores') %>%
             replace(., grepl('tbnks\\.', .), 'CBC +\nLymphocyte\nPhenotyping')) %>%
    mutate(label = label %>%
             as.character() %>%
             gsub('somalogic\\.grey\\.', '', .) %>%
             gsub('somalogic\\.modules\\.', '', .) %>%
             gsub('microarray\\.modules\\.', '', .) %>%
             gsub('tbnks\\.', '', .))

  ## We now manually clean up the feature names one-by-one to make them look better when plotting
  df = df %>%
    mutate(label = label %>%
             gsub('nk_cells_percent','NK Cells %', .) %>%
             gsub('nk_cells_abs','# NK Cells', .) %>%
             gsub('MIP.1a','MIP 1a', .) %>%
             gsub('purple','Proteomic Purple Module', .) %>%
             gsub('Cathepsin.H','Cathepsin H', .) %>%
             gsub('IL.18.Ra','IL-18 Receptor 1', .) %>%
             gsub('rdw','RDW', .) %>%
             gsub('LD78.beta','LD78 beta', .))

  ## We order the features by negative log 10 pvalue
  df = df %>% 
    arrange(neg.log10.pvals) %>%
    mutate(label = factor(label, levels = label)) %>%
    mutate(data.type = factor(data.type))

  ## We plot the feature p-values in a bar plot
  p = ggplot(df, aes(y = neg.log10.pvals, x = label, fill = data.type)) + 
    geom_bar(stat="identity") + theme_bw() + xlab('Parameter') + ylab('Negative log10 q-values') + coord_flip() +
    scale_fill_manual(values = c('darkblue','steelblue', 'lightblue')) + labs(fill = 'Data Type') +
    geom_hline(aes(yintercept = -log10(.20), linetype = 'FDR = .20'), color = 'gray', size = 1) +
    scale_linetype_manual(values = 'dashed') + ggtitle(title) +
    theme_bw() +
    theme(axis.title.x = element_text(size = 10),
          axis.text.x = element_text(size = 10),
          axis.title.y = element_text(size = 10),
          axis.text.y = element_text(size = 10),
          legend.text = element_text(size = 10),
          legend.title = element_text(size = 10))
}

# Instantiate a function to plot correlations between healthy indexes in all subjects
make_scatter_plots_one_group = function(results.1, results.2, 
                                        meta.1, meta.2, 
                                        title,
                                        x.label, y.label) {

  overlapping.subjects = intersect(rownames(results.1), rownames(results.2))

  results.1 = results.1[overlapping.subjects, ]
  meta.1 = meta.1[overlapping.subjects, ]
  results.2 = results.2[overlapping.subjects, ]
  meta.2 = meta.2[overlapping.subjects, ]

  stopifnot(as.character(meta.1$condition) == as.character(meta.2$condition))
  meta = meta.1

  hi.df = data.frame(HI.1 = results.1$all.modules.plus.grey.with.tbnks,
                     HI.2 = results.2$all.modules.plus.grey.with.tbnks,
                     condition = meta$condition)

  p = ggplot(hi.df, aes(x = HI.1, y = HI.2)) + geom_smooth(method = 'lm', formula = y ~ x, se = F) + 
    geom_point(aes(color = condition)) + ggpubr::stat_cor() +
    theme_bw() +
    ggtitle(title) + xlab(x.label) + ylab(y.label)
}

# Instantiate a function to plot correlations between healthy indexes in cases and controls
make_scatter_plots_two_group = function(results.1, results.2, 
                                        meta.1, meta.2, 
                                        title,
                                        group1 = 'Healthy', 
                                        group1.name = 'Healthy Control',
                                        group2.name,
                                        x.label, y.label) {

  overlapping.subjects = intersect(rownames(results.1), rownames(results.2))

  results.1 = results.1[overlapping.subjects, ]
  meta.1 = meta.1[overlapping.subjects, ]
  results.2 = results.2[overlapping.subjects, ]
  meta.2 = meta.2[overlapping.subjects, ]

  stopifnot(as.character(meta.1$condition) == as.character(meta.2$condition))
  meta = meta.1

  hi.df = data.frame(HI.1 = results.1$all.modules.plus.grey.with.tbnks,
                     HI.2 = results.2$all.modules.plus.grey.with.tbnks,
                     condition = meta$condition)
  hi.df = hi.df %>% 
    mutate(
      group = ifelse(hi.df$condition %in% group1, group1.name, group2.name) %>%
        factor(levels = c(group1.name, group2.name))
    )

  p = ggplot(hi.df, aes(x = HI.1, y = HI.2, group = group)) + geom_smooth(method = 'lm', formula = y ~ x, se=F) + 
    geom_point(aes(color = group)) + ggpubr::stat_cor(aes(group = group, color = group)) +
    theme_bw() +
    ggtitle(title) + xlab(x.label) + ylab(y.label)
}

# Instantiate a function to plot correlations between healthy indexes in a new group
make_scatter_plots_multi_group = function(results.1, results.2, 
                                  meta.1, meta.2, 
                                  title, x.label, y.label, remove.healthy = T) {

  overlapping.subjects = intersect(rownames(results.1), rownames(results.2))

  results.1 = results.1[overlapping.subjects, ]
  meta.1 = meta.1[overlapping.subjects, ]
  results.2 = results.2[overlapping.subjects, ]
  meta.2 = meta.2[overlapping.subjects, ]

  stopifnot(as.character(meta.1$condition) == as.character(meta.2$condition))
  meta = meta.1

  hi.df = data.frame(HI.1 = results.1$all.modules.plus.grey.with.tbnks,
                     HI.2 = results.2$all.modules.plus.grey.with.tbnks,
                     condition = meta$condition)

  if(remove.healthy) {
    hi.df = hi.df %>% filter(condition != "Healthy")
  }

  hi.df = hi.df %>% 
    filter(condition %in% names(table(condition))[table(condition) > 5])
  p = ggplot(hi.df, aes(x = HI.1, y = HI.2, group = condition)) + 
    geom_smooth(aes(color = condition), method = 'lm', formula = y ~ x, se = F) + 
    geom_point(aes(color = condition)) + ggpubr::stat_cor(aes(group = condition, color = condition)) +
    theme_bw() +
    ggtitle(title) + xlab(x.label) + ylab(y.label)
}

# Start pdf
pdf(PDF.OUT.PATH)

# Addendum Figure 1 -- AUC performance of the original healthy index among AI patients and controls
results = readRDS(ALL.BASED.HI.IN.PATH)
meta = readRDS(ALL.BASED.HI.META.IN.PATH)
results = results[meta$condition %in% c(AI, 'Healthy'), ]
meta = meta[meta$condition %in% c(AI, 'Healthy'), ]
title = 'ROC Curve of Original Classifier LOO CV predictions\namong AI and Healthy Subjects'

p = make_auc_plot(results, meta, title)
print(p)

# Addendum Figure 2 -- AUC performance of the AI-based healthy index among AI patients and controls
results = readRDS(AI.BASED.HI.IN.PATH)
meta = readRDS(AI.BASED.HI.META.IN.PATH)
title = 'ROC Curve of AI-Based Classifier LOO CV predictions\namong AI and Healthy Subjects'

p = make_auc_plot(results, meta, title)
print(p)

# Addendum Figure 3 -- AUC performance of the original healthy index among PID patients and controls
results = readRDS(ALL.BASED.HI.IN.PATH)
meta = readRDS(ALL.BASED.HI.META.IN.PATH)
results = results[meta$condition %in% c(PID, 'Healthy'), ]
meta = meta[meta$condition %in% c(PID, 'Healthy'), ]
title = 'ROC Curve of Original Classifier LOO CV predictions\namong PID and Healthy Subjects'

p = make_auc_plot(results, meta, title)
print(p)

# Addendum Figure 4 -- AUC performance of the original healthy index among PID patients and controls
results = readRDS(PID.BASED.HI.IN.PATH)
meta = readRDS(PID.BASED.HI.META.IN.PATH)
title = 'ROC Curve of PID-Based Classifier LOO CV predictions\namong PID and Healthy Subjects'

p = make_auc_plot(results, meta, title)
print(p)

# Addendum Figure 5 -- Bar plots of original healthy index for AI subjects
results = readRDS(ALL.BASED.HI.IN.PATH)
meta = readRDS(ALL.BASED.HI.META.IN.PATH)
results = results[meta$condition %in% c(AI, 'Healthy'), ]
meta = meta[meta$condition %in% c(AI, 'Healthy'), ]
title = 'Barplots of Original Classifier LOO CV predictions\namong AI and Healthy Subjects'

p = make_bar_plots(results, meta, title)
print(p)

# Addendum Figure 6 -- Bar plots of AI-based healthy index for AI subjects
results = readRDS(AI.BASED.HI.IN.PATH)
meta = readRDS(AI.BASED.HI.META.IN.PATH)
title = 'Barplots of AI-Based Classifier LOO CV predictions\namong AI and Healthy Subjects'

p = make_bar_plots(results, meta, title)
print(p)

# Addendum Figure 7 -- Bar plots of original healthy index for PID subjects
results = readRDS(ALL.BASED.HI.IN.PATH)
meta = readRDS(ALL.BASED.HI.META.IN.PATH)
results = results[meta$condition %in% c(PID, 'Healthy'), ]
meta = meta[meta$condition %in% c(PID, 'Healthy'), ]
title = 'Barplots of Original Classifier LOO CV predictions\namong PID and Healthy Subjects'

p = make_bar_plots(results, meta, title)
print(p)

# Addendum Figure 8 -- Bar plots of PID-based healthy index for PID subjects
results = readRDS(PID.BASED.HI.IN.PATH)
meta = readRDS(PID.BASED.HI.META.IN.PATH)
title = 'Barplots of PID-Based Classifier LOO CV predictions\namong PID and Healthy Subjects'
p = make_bar_plots(results, meta, title)
print(p)

# Addendum Figure 9 -- p value plots for original classifier
p.vals = readRDS(ALL.BASED.HI.PVALS.IN.PATH)
title = 'Negative log10 adjusted pvalues for feature GVI\nin original classifier'
p = make_pval_plots(p.vals, title = title)
print(p)

# Addendum Figure 10 -- p value plots for the AI-based classifier
p.vals = readRDS(AI.BASED.HI.PVALS.IN.PATH)
title = 'Negative log10 adjusted pvalues for feature GVI\nin AI-based classifier'
p = make_pval_plots(p.vals, title = title)
print(p)

# Addendum Figure 11 -- p value plots for the PID-based classifier
p.vals = readRDS(PID.BASED.HI.PVALS.IN.PATH)
title = 'Negative log10 adjusted pvalues for feature GVI\nin PID-based classifier'
p = make_pval_plots(p.vals, title = title)
print(p)

# Addendum Figure 12 -- correlation between original healthy index and AI-based healthy index
results.all = readRDS(ALL.BASED.HI.IN.PATH)
results.ai = readRDS(AI.BASED.HI.IN.PATH)
meta.all = readRDS(ALL.BASED.HI.META.IN.PATH)
meta.ai = readRDS(AI.BASED.HI.META.IN.PATH)

p = make_scatter_plots_two_group(results.1 = results.all, results.2 = results.ai, 
                                 meta.1 = meta.all, meta.2 = meta.ai, 
                                 title = 'Correlation between original healthy index\nand AI-Based healthy index', 
                                 group1.name = 'Healthy', group2.name = 'AI',
                                 x.label = 'Original HI', y.label = 'AI-Based HI')

print(p)

# Addendum Figure 13 -- correlation between original healthy index and AI-based healthy index
results.all = readRDS(ALL.BASED.HI.IN.PATH)
results.ai = readRDS(AI.BASED.HI.IN.PATH)
meta.all = readRDS(ALL.BASED.HI.META.IN.PATH)
meta.ai = readRDS(AI.BASED.HI.META.IN.PATH)

p = make_scatter_plots_multi_group(results.1 = results.all, results.2 = results.ai, 
                                 meta.1 = meta.all, meta.2 = meta.ai, 
                                 title = 'Correlation between original healthy index\nand AI-Based healthy index',
                                 x.label = 'Original HI', y.label = 'AI-Based HI')
print(p)

# Addendum Figure 14 -- correlation between original healthy index and AI-based healthy index
results.all = readRDS(ALL.BASED.HI.IN.PATH)
results.pid = readRDS(PID.BASED.HI.IN.PATH)
meta.all = readRDS(ALL.BASED.HI.META.IN.PATH)
meta.pid = readRDS(PID.BASED.HI.META.IN.PATH)

p = make_scatter_plots_two_group(results.1 = results.all, results.2 = results.pid, 
                                 meta.1 = meta.all, meta.2 = meta.pid, 
                                 title = 'Correlation between original healthy index\nand PID-Based healthy index', 
                                 group1.name = 'Healthy', group2.name = 'PID',
                                 x.label = 'Original HI', y.label = 'PID-Based HI')
print(p)

# Addendum Figure 15 -- correlation between original healthy index and AI-based healthy index
results.all = readRDS(ALL.BASED.HI.IN.PATH)
results.pid = readRDS(PID.BASED.HI.IN.PATH)
meta.all = readRDS(ALL.BASED.HI.META.IN.PATH)
meta.pid = readRDS(PID.BASED.HI.META.IN.PATH)

p = make_scatter_plots_multi_group(results.1 = results.all, results.2 = results.pid, 
                                 meta.1 = meta.all, meta.2 = meta.pid, 
                                 title = 'Correlation between original healthy index\nand PID-Based healthy index', 
                                 x.label = 'Original HI', y.label = 'PID-Based HI')
print(p)

# Addendum Figure 16 -- correlation between AI-based healthy index and PID-based healthy index among healthy controls
results.ai = readRDS(AI.BASED.HI.IN.PATH)
results.pid = readRDS(PID.BASED.HI.IN.PATH)
meta.ai = readRDS(AI.BASED.HI.META.IN.PATH)
meta.pid = readRDS(PID.BASED.HI.META.IN.PATH)

p = make_scatter_plots_multi_group(results.1 = results.ai, results.2 = results.pid, 
                                   meta.1 = meta.ai, meta.2 = meta.pid, 
                                   title = 'Correlation between AI-Based healthy index\nand PID-Based healthy index among healthy controls', 
                                   x.label = 'AI-Based HI', y.label = 'PID-Based HI',
                                   remove.healthy = F)
print(p)

# Addendum Figures 17 & 18 -- correlation between AI-based healthy index and PID-based healthy index among AI patients
results.pid.from.ai = readRDS(PID.PREDICTIONS.FROM.AI.IN.PATH)
results.pid.from.pid = readRDS(PID.BASED.HI.IN.PATH)
meta.pid = readRDS(PID.BASED.HI.META.IN.PATH)

meta.pid = meta.pid[meta.pid$condition %in% PID,]
p = make_scatter_plots_one_group(results.1 = results.pid.from.ai, results.2 = results.pid.from.pid, 
                                   meta.1 = meta.pid, meta.2 = meta.pid, 
                                   title = 'Correlation between AI-Based healthy index\nand PID-Based healthy index\namong PID subjects', 
                                   x.label = 'AI-Based HI', y.label = 'PID-Based HI')
print(p)

p = make_scatter_plots_multi_group(results.1 = results.pid.from.ai, results.2 = results.pid.from.pid, 
                                 meta.1 = meta.pid, meta.2 = meta.pid, 
                                 title = 'Correlation between AI-Based healthy index\nand PID-Based healthy index\namong PID subjects (condition-specific)', 
                                 x.label = 'AI-Based HI', y.label = 'PID-Based HI')
print(p)

# Addendum Figures 19 & 20 -- correlation between AI-based healthy index and PID-based healthy index among PID patients
results.ai.from.pid = readRDS(AI.PREDICTIONS.FROM.PID.IN.PATH)
results.ai.from.ai = readRDS(AI.BASED.HI.IN.PATH)
meta.ai = readRDS(AI.BASED.HI.META.IN.PATH)

meta.ai = meta.ai[meta.ai$condition %in% AI,]
p = make_scatter_plots_one_group(results.1 = results.ai.from.pid, results.2 = results.ai.from.ai, 
                                 meta.1 = meta.ai, meta.2 = meta.ai, 
                                 title = 'Correlation between PID-Based healthy index\nand AI-Based healthy index\namong AI subjects', 
                                 x.label = 'PID-Based HI', y.label = 'AI-Based HI')
print(p)

p = make_scatter_plots_multi_group(results.1 = results.ai.from.pid, results.2 = results.ai.from.ai, 
                                 meta.1 = meta.ai, meta.2 = meta.ai, 
                                 title = 'Correlation between PID-Based healthy index\nand AI-Based healthy index\namong AI subjects (condition-specific)', 
                                 x.label = 'PID-Based HI', y.label = 'AI-Based HI')
print(p)

# Addendum Figures 21 & 22 -- correlation between AI-based healthy index and original healthy index among PID patients
results.pid.from.ai = readRDS(PID.PREDICTIONS.FROM.AI.IN.PATH)
results.pid.from.all = readRDS(ALL.BASED.HI.IN.PATH)
meta.all = readRDS(ALL.BASED.HI.META.IN.PATH)

meta.all = meta.all[meta.all$condition %in% PID,]
p = make_scatter_plots_one_group(results.1 = results.pid.from.all, results.2 = results.pid.from.ai, 
                                 meta.1 = meta.all, meta.2 = meta.all, 
                                 title = 'Correlation between AI-Based healthy index\nand original healthy index\namong PID subjects', 
                                 x.label = 'Original HI', y.label = 'AI-Based HI')
print(p)

p = make_scatter_plots_multi_group(results.1 = results.pid.from.all, results.2 = results.pid.from.ai, 
                                 meta.1 = meta.all, meta.2 = meta.all, 
                                 title = 'Correlation between AI-Based healthy index\nand original healthy index\namong PID subjects (condition-specific)', 
                                 x.label = 'Original HI', y.label = 'AI-Based HI')
print(p)

# Addendum Figures 23 & 24 -- correlation between AI-based healthy index and original healthy index among AI patients
results.ai.from.pid = readRDS(AI.PREDICTIONS.FROM.PID.IN.PATH)
results.ai.from.all = readRDS(ALL.BASED.HI.IN.PATH)
meta.all = readRDS(ALL.BASED.HI.META.IN.PATH)

meta.all = meta.all[meta.all$condition %in% AI,]
p = make_scatter_plots_one_group(results.1 = results.ai.from.all, results.2 = results.ai.from.pid, 
                                 meta.1 = meta.all, meta.2 = meta.all, 
                                 title = 'Correlation between PID-Based healthy index\nand original healthy index\namong AI subjects', 
                                 x.label = 'Original HI', y.label = 'PID-Based HI')
print(p)

p = make_scatter_plots_multi_group(results.1 = results.ai.from.all, results.2 = results.ai.from.pid, 
                                 meta.1 = meta.all, meta.2 = meta.all, 
                                 title = 'Correlation between PID-Based healthy index\nand original healthy index\namong AI subjects (condition-specific)', 
                                 x.label = 'Original HI', y.label = 'PID-Based HI')
print(p)
dev.off()

R ggplot2 dplyr tidyr From line 2 of Supplemental_Figure_4/supplemental_figure_4_addendum_version_1.R

library(ggplot2)
library(gridExtra)
library(ggrepel)
library(ggpubr)
library(dplyr)
library(tidyr)
library(reshape2)


if(!exists("snakemake")){
  setwd("../../..")
  source("scripts/util/paper/parse_snakemake.R")
  parse_snakemake(rule = "supplemental_figure_4")
}

source("scripts/util/Plotting/tbnk_featurename_replace.R")

# Set paths
## the monogenic metadata database
META.IN.PATH = snakemake@input[[1]]#'Classification/healthy_random_forest_sample_meta_data_all.RDS'
## the LLOO CV results for each patient for each classifier (i.e. the various healthy indexes)
HI.IN.PATH = snakemake@input[[2]]#'Classification/results/healthy_rf_results_all.RDS'
## the results of the JIVE algorithm
JIVE.IN.PATH = snakemake@input[[3]]#"Integration_output/jive/subject/prcomp_list.rds"
## the pvalues for each feature in the random forest classifier
GVI.PVALS.IN.PATH = snakemake@input[[4]]#"Classification/results/healthy_rf_pvals_all.RDS"
## the somalogic subject-level training eset
ESET.IN.PATH = snakemake@input[[5]]#"Data/Somalogic/data_analysis_ready/analysis_ready_subject_level_training_somalogic.rds" 

## The LOO CV results for each patient for each condition-based random forest classifier
HI.CONDITION.IN.PATHS = list(
  CGD = snakemake@input[[6]],#'Classification/results/cgd_rf_results_all.RDS',
  STAT1.GOF = snakemake@input[[7]],#'Classification/results/stat1_rf_results_all.RDS',
  FMF = snakemake@input[[8]],#'Classification/results/fmf_rf_results_all.RDS',
  Job = snakemake@input[[9]]#'Classification/results/job_rf_results_all.RDS'
)

## The meta data used for each condition-based random forest classifier
RF.META.IN.PATHS = list(
  CGD = snakemake@input[[10]],#'Classification/cgd_random_forest_sample_meta_data_all.RDS',
  STAT1.GOF = snakemake@input[[11]],#'Classification/stat1_random_forest_sample_meta_data_all.RDS',
  FMF = snakemake@input[[12]],#'Classification/fmf_random_forest_sample_meta_data_all.RDS',
  Job = snakemake@input[[13]]#'Classification/job_random_forest_sample_meta_data_all.RDS'
)

## The condition-based random forest classifier results for the pvalue of the GVI of each feature via permutation testing
HI.CONDITION.PVALS.IN.PATH = list(
  CGD = snakemake@input[[14]],#'Classification/results/cgd_rf_pvals_all.RDS',
  STAT1.GOF = snakemake@input[[15]],#'Classification/results/stat1_rf_pvals_all.RDS',
  FMF = snakemake@input[[16]],#'Classification/results/fmf_rf_pvals_all.RDS',
  Job = snakemake@input[[17]]#'Classification/results/job_rf_pvals_all.RDS'
)

SUPPLEMENTAL.FIGURE.4a.OUT.PATH = snakemake@output[[1]]#'Paper_1_Figures/Supplemental_Figure_1/S4a.pdf'
SUPPLEMENTAL.FIGURE.4b.OUT.PATH = snakemake@output[[2]]#'Paper_1_Figures/Supplemental_Figure_1/S4b.pdf'
SUPPLEMENTAL.FIGURE.4c.OUT.PATH = snakemake@output[[3]]#'Paper_1_Figures/Supplemental_Figure_1/S4c.pdf'
SUPPLEMENTAL.FIGURE.4d.OUT.PATH = snakemake@output[[4]]#'Paper_1_Figures/Supplemental_Figure_1/S4d.pdf'
SUPPLEMENTAL.FIGURE.4e.OUT.PATH = snakemake@output[[5]]#'Paper_1_Figures/Supplemental_Figure_1/S4e.pdf'
SUPPLEMENTAL.FIGURE.4f.OUT.PATH = snakemake@output[[6]]#'Paper_1_Figures/Supplemental_Figure_1/S4f.pdf'
SUPPLEMENTAL.FIGURE.4g.OUT.PATH = snakemake@output[[7]]#'Paper_1_Figures/Supplemental_Figure_1/S4g.pdf'


# Source utilities
source('scripts/util/Plotting/plot_auc.R')
source('scripts/util/paper/abbrev_cond.R')
# Supplemental figure 4a - Healthy index ROC curves by age group

## Load data
meta = readRDS(META.IN.PATH)
results = readRDS(HI.IN.PATH)

## Extract the HI for each patient
results = results$all.modules.plus.grey.with.tbnks

## Instantiate an empty list to hold ROC data frames
dfs = list()

## Separate the range of ages into three groups
ages = c(0,15,50,100)
groups = c('< 15 yrs','15-50 yrs','> 50 yrs')

## For each age group
for(i in 1:3) {

  group = groups[i]

  ## Find the subjects from that age group
  select = meta$Age > ages[i] & meta$Age <= ages[i+1]

  ## Get the HIs of these subjects
  x = results[select]

  ## Get the conditions of these subjects
  y = meta$condition[select]

  ## Create an ROC curve for these subjects
  roc = get_roc(x, y, pos = 'Healthy')

  ## Get the associated ROC
  auc = get_auc(roc)

  ## Store the ROC curve as a data frame, with the age group and auc appended
  df = roc
  df$Age.Group = paste0(group,': ', format(auc, digits = 2))

  ## Add the dataframe to the list of data frames
  dfs[[length(dfs) + 1]] = df
}

## Put the dataframes from each age group together
df = Reduce(rbind, dfs)

## Plot the ROC curves
p = ggplot(df, aes(x = fpr, y = tpr, color = Age.Group)) + geom_line() +
  geom_abline(slope = 1, linetype = 'dashed', color = 'grey') + 
  theme_bw() +
  xlab('False Positive Rate') + ylab('True Positive Rate') + labs(color = 'Classifier: AUC') + 
  theme(axis.title.x = element_text(size = 10),
        axis.title.y = element_text(size = 10),
        axis.text.x = element_text(size = 10),
        axis.text.y = element_text(size = 10),
        legend.title = element_text(size = 10),
        legend.text = element_text(size = 10),
        legend.position = c(.99, .01),
        legend.justification = c(1, 0))

ggsave(SUPPLEMENTAL.FIGURE.4a.OUT.PATH, p, device = 'pdf', height = 5, width = 5)

# Supplemental figure 4b -- ROC curves for the various classifiers

## Load the meta data and random forest LOO CV predictions for each subject
meta = readRDS(META.IN.PATH)
results = readRDS(HI.IN.PATH)

## For each classifier
dfs = mapply(function(x, name) {
  ## Get the ROC curve for that classifier
  roc = get_roc(x = x, y = meta$condition, pos = 'Healthy')
  ## Get the AUC associated with that classifier
  auc = get_auc(roc)
  ## Associate the classifier name and AUC with the ROC dataframe
  roc$Classifier = name
  roc$AUC = auc
  ## Return the ROC dataframe
  return(roc)
}, results, names(results), SIMPLIFY = FALSE)

## Combine the ROC data frames from each classifier
df = Reduce(rbind, dfs)

## Convert the classifier names to be more clear for the plot
conversion = c("microarray.modules" = 'Gene modules', 
               "tbnks" = 'CBC + TBNK',
               "cbcs" = 'CBC',
               "somalogic.modules" = 'Protein modules', 
               "all.modules.with.tbnks" = 'Modules + CBC + TBNK', 
               "all.modules.plus.grey.with.tbnks" = 'Modules + CBC + TBNK + Grey Proteins')
df$Classifier = conversion[df$Classifier]

## Sort the classifiers based on AUC
df = df %>%
  mutate(AUC.formatted = format(df$AUC, digits = 2)) %>%
  mutate(Classifier = paste0(Classifier, ': ', AUC.formatted)) %>%
  arrange(AUC) %>%
  mutate(Classifier = factor(Classifier, levels = unique(Classifier)))

## Plot the various ROC curves together
p = ggplot(df, aes(x = fpr, y = tpr, color = Classifier)) + geom_line() +
  scale_color_manual(values = c('#F8766D', ## we manually choose the colors
                                '#A3A500',
                                '#00BF7D',
                                '#00B0F6',
                                '#E76BF3',
                                '#000000')) +
  geom_abline(slope = 1, linetype = 'dashed', color = 'grey') + ## Include the line y = x to show the performance of a theoretical naive classifier
  theme_bw() +
  xlab('False Positive Rate') + ylab('True Positive Rate') + labs(color = 'Classifier: AUC') + 
  theme(axis.title.x = element_text(size = 9),
        axis.title.y = element_text(size = 9),
        axis.text.x = element_text(size = 9),
        axis.text.y = element_text(size = 9),
        legend.title = element_text(size = 9),
        legend.text = element_text(size = 9),
        legend.position = c(.99, .01),
        legend.justification = c(1, 0))

ggsave(SUPPLEMENTAL.FIGURE.4b.OUT.PATH, p, device = 'pdf', height = 4, width = 4.5)

# Supplemental Figure 4c -- JIVE versus HI
jive = readRDS(JIVE.IN.PATH) ## Get the jive results
results = readRDS(HI.IN.PATH) ## Get the LOO CV predictions

## Extract the healthy index from the LOO CV predictions
predictions = results[,'all.modules.plus.grey.with.tbnks']
names(predictions) = rownames(results)

## Instantiate a function to create a data frame with healthy index and jive PCs 1:3 from one of the jive matrices
get_df = function(mat.name) {
  pcs = jive[[mat.name]]$x
  ids = intersect(rownames(results), rownames(pcs))
  pcs = pcs[ids, 1:3]
  df = as.data.frame(pcs)
  df$id = rownames(df)
  df = gather(df, key = "variable", value = "value", -id)
  df$predictions = predictions[df$id]
  df$matrix = mat.name
  return(df)
}

mat.names = c('joint','array.ind','soma.ind') ## The various JIVE matrices

## Run the function on each jive matrix
dfs = lapply(mat.names, get_df)
## And stick the results together
df = Reduce(rbind, dfs)

## Order the matrix types by joint, gene, and protein
df <- df %>% 
        mutate(matrix = gsub("joint", "Joint", matrix)) %>%
        mutate(matrix = replace(matrix, matrix == "array.ind", "Transcriptome\nIndividual")) %>%
        mutate(matrix = replace(matrix, matrix == "soma.ind", "Proteome\nIndividual"))

df$matrix = factor(df$matrix, levels = c("Joint", "Transcriptome\nIndividual",
                                         "Proteome\nIndividual"))

## Plot a grid of scatterplots of the correlations between the HI and JIVE PC
## for the joint, protein, and gene JIVE outputs and for PCs 1:3 for each output
p = ggplot(df, aes(x = predictions, y = value)) + ylab('PC Score') +
  xlab('Immune Health Metric') +
  geom_point() + geom_smooth(method = 'lm', formula = y~x, se = FALSE) +
  facet_grid(rows = vars(variable), cols = vars(matrix), scales = "free_y") +
  #xlim(0, .75) +
  stat_cor() +
  theme_bw() + 
  theme(axis.title.x = element_text(size = 15),
        axis.text.x = element_text(size = 15),
        axis.text.y = element_text(size = 15),
        axis.title.y = element_text(size = 15),
        strip.text = element_text(size = 15))

ggsave(SUPPLEMENTAL.FIGURE.4c.OUT.PATH, p, device = 'pdf', height = 5, width = 7)

# Supplemental Figure 4d -- feature-by-feature view of each classifier

## Load the pvalues associated with each feature in the healthy classifiers based on permutation testing of the GVI
results = readRDS(GVI.PVALS.IN.PATH)

## Convert each classifier's name to something more clear (i.e. the data types used by that classifier)
name_conversions = c(cbcs = 'CBC', tbnks = 'CBC + TBNK', microarray.modules = 'Gene Modules', 
                     somalogic.modules = 'Protein Modules', all.modules.with.tbnks = 'Modules + CBC +\nTBNK',
                     all.modules.plus.grey.with.tbnks = 'Modules + CBC +\nTBNK +\nGrey Proteins')

## Instantiate a function to get the negative log10 adjusted pvalue from the permutation test of each feature in the classifier
get.df = function(result, classifier) {
  p.adjusted = p.adjust(result, 'fdr')
  neg.log10.p.adjusted = -1 * log10(p.adjusted)
  df = data.frame(p.adjusted = p.adjusted, neg.log10.pvals = neg.log10.p.adjusted, classifier = classifier)
  df$label = rownames(df)
  return(df)
}

## Run this function on each classifier
dfs = mapply(get.df, results, names(results), SIMPLIFY = FALSE)

## Put all these results together
df = Reduce(rbind, dfs)

## Format the data frames to label significant features
df = df %>%
  mutate(classifier = name_conversions[classifier] %>% factor(levels = name_conversions)) %>%
  mutate(significant = ifelse(p.adjusted < .20, '< .20', '> .20') %>% factor(levels = c('> .20', '< .20'))) %>%
  mutate(label = replace(label, significant == '> .20', ""))

## Manually change some of the labels to be more clear or concise
df = df %>% 
  mutate(label = label %>%
           gsub('tbnks\\.','',.) %>%
           gsub('somalogic\\.grey\\.','',.) %>%
           gsub('somalogic','protein',.) %>%
           gsub('microarray','rna', .) %>%
           gsub("nk_cells_abs", "NK cells(#)", .) %>%
           gsub("nk_cells_percent", "NK cells(%)", .) %>%
           gsub("protein\\.modules\\.purple", "PM2", .) %>%
           gsub('rdw','RDW', .) %>%
           gsub('\\.',' ',.) %>%
           gsub('_',' ',.)
   )

   #%>%
           #gsub('\\.modules','',.) %>%
           #gsub('_',' ',.))

## Instantiate an empty list to hold the plot for each classifier
ps = list()

## Manually choose which color should be used for each classifier (what's science without some art?)
colors = c('CBC' = 'magenta',
           'CBC + TBNK' = 'red',
           'Gene Modules' = 'green',
           'Protein Modules' = 'blue',
           'Modules + CBC +\nTBNK' = 'purple',
           'Modules + CBC +\nTBNK +\nGrey Proteins' = 'black')

## Set the order for the classifier legend
df$classifier = factor(df$classifier, names(colors))

## For each classifier
for(classifier in levels(df$classifier)) {
  ## Get the color for that classifier
  col = colors[[classifier]]
  ## Subset the combined data frame to just the results for that classifier
  df.subset = df %>% filter(classifier == !!classifier)
  ## Create a scatter plot of each classifier (where x is always 0 to keep the points in a vertical line)
  p = ggplot(data = df.subset, aes(x = 0, y = neg.log10.pvals, label = label, color = significant)) + 
    geom_point(show.legend = FALSE) + 
    scale_color_manual(values = c('grey', col)) + # Make the non-significant features grey and the significant ones the chosen color
    xlab(classifier) + 
    ylab('Negative log10 p-value') +
    geom_text_repel(direction='y', nudge_x = .025, hjust = 0, show.legend = FALSE) + # Nudge the feature labels
    xlim(0, .1) + ylim(0, 2.5) + theme_bw()

  ## The classifier is the CBCs, which forms the rightmost panel, allow it to display the y-axis title
  if(classifier == 'CBCs') {
    p = p + theme(
      axis.line.x  = element_blank(),
      axis.ticks.x = element_blank(),
      axis.text.x  = element_blank(),
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      panel.background = element_blank(),
      panel.border = element_blank(),
      axis.line.y = element_line(colour = "black"))
  } else {
    p = p + theme(
      axis.line.x  = element_blank(),
      axis.ticks.x = element_blank(),
      axis.text.x  = element_blank(),
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      panel.border = element_blank(),
      panel.background = element_blank(),
      axis.title.y = element_blank(),
      axis.line.y = element_line(colour = "black"))
  }

  ## Add the plot to the grid
  ps[[length(ps) + 1]] = p
}

## Print the grid
pdf(SUPPLEMENTAL.FIGURE.4d.OUT.PATH, height = 4, width = 11)
grid.arrange(ps[[1]], ps[[2]], ps[[3]], ps[[4]], ps[[5]], ps[[6]], ncol = 6)
dev.off()

# Supplemental Figure 4e -- jPC1-age relationship among top conditions 
jive = readRDS(JIVE.IN.PATH) ## Extract the Jive results
eset = readRDS(ESET.IN.PATH) ## Extract the somalogic eset

## Get the first PC
jPC1 = jive$joint$x[,1]
eset = eset[, names(jPC1)]

## Set the conditions to display
conditions = c('Healthy','47CGD','XCGD','Job','STAT1 GOF','FMF')

## Create a data frame displaying the Age and jPC1 for each condition
df = data.frame(jPC1 = jPC1, condition = eset$condition, age = eset$Age) %>%
  filter(condition %in% conditions) %>%
  mutate(condition = condition %>% as.character %>% abbrev_cond) %>%
  mutate(condition = factor(condition, levels = abbrev_cond(conditions)))

## Create the the Age-PC1 scatterplot for each condition and plot in a grid
p = ggplot(df, aes(x = age, y = jPC1)) + geom_point() + 
  facet_wrap(~condition, ncol = 3, nrow = 2) +
  geom_smooth(method = 'lm', formula = y~x, se = FALSE) +
  ylim(-50, 50) + xlim(0, 80) +
  stat_cor(label.x = 0, label.y = 45) +
  theme_bw() + 
  theme(axis.title.x = element_text(size = 15),
        axis.text.x = element_text(size = 15),
        axis.text.y = element_text(size = 15),
        axis.title.y = element_text(size = 15),
        strip.text = element_text(size = 15))

ggsave(SUPPLEMENTAL.FIGURE.4e.OUT.PATH, p, device = 'pdf', height = 4, width = 6)

# Supplemental Figure 4f -- condition-specific classifiers
results = lapply(HI.CONDITION.IN.PATHS, readRDS) ## Extract the condition-specific classifier results 
metas = lapply(RF.META.IN.PATHS, readRDS) ## Extact the meta data assocaited with each condition-specific classifier

## List the condition groups for each classifier
condition.groups = list(CGD = c('XCGD', '47CGD'),
                        STAT1.GOF = 'STAT1 GOF',
                        FMF = 'FMF',
                        Job = 'Job')

## Create a name conversion map to make the data types underlying each classifier more clear
conversion = c("microarray.modules" = 'Gene modules', 
               "tbnks" = 'CBCs + TBNK',
               "cbcs" = 'CBCs',
               "somalogic.modules" = 'Protein modules', 
               "all.modules.with.tbnks" = 'Modules + CBC + TBNK', 
               "all.modules.plus.grey.with.tbnks" = 'Modules + CBC + TBNK + Grey Proteins')

## Insantiate a function to get the AUC associated with each classifier and each condition
get_aucs = function(result, meta, condition.group) {
  ## Get the condition associated with each patient
  conditions = meta[rownames(result), 'condition']
  apply(result, 2, function(x) {
    ## Get the ROC curve associated with each classifier
    roc = get_roc(x = x, y = conditions, pos = condition.group)
    ## Get the AUC of that ROC curve
    get_auc(roc)
  })
}

## Run the function on each of the condition-specific classifier results (and simplify into a matrix)
aucs = mapply(get_aucs, results, metas, condition.groups, SIMPLIFY = T)

## Create a data frame holding the AUCs for each classifier, and melt it
df = as.data.frame(aucs) %>% 
  tibble::rownames_to_column(var = 'classifier') %>%
  mutate(classifier = conversion[classifier]) %>%
  mutate(classifier = factor(classifier, levels = conversion)) %>%
  melt()

## Create grouped barplots for each classifier and each condition
p = ggplot(df, aes(x = variable, y = value, fill = classifier)) + 
  geom_bar(stat = 'identity', position = 'dodge') +
  theme_bw() + labs(fill = 'Classifier') + xlab('Condition') + ylab('AUC')

ggsave(SUPPLEMENTAL.FIGURE.4f.OUT.PATH, p, device = 'pdf', height = 6, width = 9)

# Supplemental Figure 4g -- heatmap of gvis
gvi.pvals = lapply(HI.CONDITION.PVALS.IN.PATH, readRDS) ## Get the GVI pvalues associated with each classifier and each condition
pvals = sapply(gvi.pvals, function(x) {x$all.modules.plus.grey.with.tbnks}) ## Extract the pvalues for the features in the classifier with all data types
pvals = as.data.frame(pvals) ## Orangize this matrix into a data frame

## Get the top 5 features from each condition's classifier
top_features = lapply(colnames(pvals), function(group) {
  x = rownames(pvals)[order(pvals[[group]], decreasing = FALSE)]
  x = x[1:5]
})
top_features = unique(unlist(top_features))

## Adjust the pvalues using BH correction within each classifer, and get the negative log 10 adjusted pvalues
pvals = apply(pvals, 2, function(x) {
  x = p.adjust(x, 'fdr')
  x = -log10(x)
})

## Subject to just the top features
pvals = pvals[top_features, ]

## Create an index to associate each feature and each condition with a row and column
n = nrow(pvals)
m = ncol(pvals)
xs = t(matrix(1:m, nrow = m, ncol = n))
ys = matrix(1:n, nrow = m, ncol = n)

## Put the pvalue results, x-indexes, and y-indexes into a data frame
df = data.frame(x = xs[1:(n*m)], y = ys[1:(n*m)], NLP = pvals[1:(n*m)])
df$x = factor(df$x)
levels(df$x) = colnames(pvals)
df$y = factor(df$y)
levels(df$y) = rownames(pvals)

## And plot the associated heatmap using the ggplot tile function
p = ggplot(df, aes(x = x, y = y, fill = NLP)) + geom_tile() + theme_bw() + 
  xlab('Condition') + ylab('Feature') + labs(fill = 'Negative log10 pvalue')
ggsave(SUPPLEMENTAL.FIGURE.4g.OUT.PATH, p, device = 'pdf', height = 6, width = 9)

R Snakemake ggplot2 dplyr tidyr ggrepel ggpubr reshape2 gridExtra From line 2 of Supplemental_Figure_4/supplemental_figure_4_version_5.R

library(tidyverse)
library(ggpubr)

IHM.IN.PATH <- snakemake@input[[1]]
META.IN.PATH <- snakemake@input[[2]]

FIG.OUT.PATH <- snakemake@output[[1]]

ihm <- readRDS(IHM.IN.PATH)

meta <- readRDS(META.IN.PATH)

#mods <- readRDS("../Pipeline_out/Classification/results_no_pm2/healthy_rf_models_all.RDS")
#
#hmod <- mods$all.modules.plus.grey.with.tbnks
#
##don't see soma purple mod
#hmod$importance

ihm <- ihm$all.modules.plus.grey.with.tbnks

dat <- meta %>%
        mutate(ihm = ihm)


#keep_cond <- c("47CGD", "XCGD", "Healthy", "Job", "STAT1 GOF", "FMF")
keep_cond <- c("Healthy")
p <- dat %>%
        filter(condition %in% keep_cond) %>%
        ggplot(aes(x = Age, y = ihm)) +
        geom_point() +
        geom_smooth(method = "lm", se = FALSE) +
        stat_cor(method = "spearman") +
        theme_bw() +
        facet_wrap(~condition)

ggsave(plot = p, filename = FIG.OUT.PATH, height = 3, width = 3)

R ggplot2 tidyverse ggpubr From line 2 of Supplemental_Figure_5/no_pm2_model_age_cor.R

library(ggplot2)
library(Biobase)
library(ggpubr)
library(MetaIntegrator)
library(limma)
library(dplyr)


# Set paths
if(exists("snakemake")){
  ## The effect sizes associated with each feature in the meta analysis
  META.ANALYSIS.Z.SCORE.IN.PATH = snakemake@input[[1]]#'Reference/jamboree/analysis_output/results/jamboree_z_score_results.RDS'
  ## The comparison group pairs from the jamboree
  CGPS.IN.PATH = snakemake@input[[2]]#'Reference/jamboree/data_analysis_ready/cgps_clean.RDS'
  ## The transcriptional surrogate signature gene set enrichment results associated with each feature in the meta-analysis
  META.ANALYSIS.ENRICHMENTS.IN.PATH = snakemake@input[[3]]#'Reference/jamboree/analysis_output/results/jamboree_enrichment_results.RDS'
  ## The baltimore aging cohort eset
  ESET.IN.PATH = snakemake@input[[4]]#'Reference/ferrucci/processed/aging_eset.RDS'
  ## The plasma somalogic surrogate signature for the healthy index
  SIGNATURE.IN.PATH = snakemake@input[[5]]#'Classification/proteomic_surrogates/healthy.index.plasma.surrogate.somaId.RDS'
  ## The Ferrucci significance table
  TABLE.IN.PATH = snakemake@input[[6]]#'Reference/ferrucci/raw/acel12799-sup-0004-TableS3.txt'

  SUPPLEMENTAL.FIGURE.5a.OUT.PATH = snakemake@output[[1]]#'Paper_1_Figures/Supplemental_Figure_5/S5a.pdf'
  SUPPLEMENTAL.FIGURE.5b.OUT.PATH = snakemake@output[[2]]#'Paper_1_Figures/Supplemental_Figure_5/S5b.pdf'
  SUPPLEMENTAL.FIGURE.5c.OUT.PATH = snakemake@output[[3]]#'Paper_1_Figures/Supplemental_Figure_5/S5c.pdf'
}else{
  ## The effect sizes associated with each feature in the meta analysis
  META.ANALYSIS.Z.SCORE.IN.PATH = 'Reference/jamboree/analysis_output/results/jamboree_z_score_results.RDS'
  ## The comparison group pairs from the jamboree
  CGPS.IN.PATH = 'Reference/jamboree/data_analysis_ready/cgps_clean.RDS'
  ## The transcriptional surrogate signature gene set enrichment results associated with each feature in the meta-analysis
  META.ANALYSIS.ENRICHMENTS.IN.PATH = 'Reference/jamboree/analysis_output/results/jamboree_enrichment_results.RDS'
  ## The baltimore aging cohort eset
  ESET.IN.PATH = 'Reference/ferrucci/processed/aging_eset.RDS'
  ## The plasma somalogic surrogate signature for the healthy index
  SIGNATURE.IN.PATH = 'Classification/proteomic_surrogates/healthy.index.plasma.surrogate.somaId.RDS'
  ## The Ferrucci significance table
  TABLE.IN.PATH = 'Reference/ferrucci/raw/acel12799-sup-0004-TableS3.txt'

  SUPPLEMENTAL.FIGURE.5a.OUT.PATH = 'Paper_1_Figures/Supplemental_Figure_5/S5a.pdf'
  SUPPLEMENTAL.FIGURE.5b.OUT.PATH = 'Paper_1_Figures/Supplemental_Figure_5/S5b.pdf'
  SUPPLEMENTAL.FIGURE.5c.OUT.PATH = 'Paper_1_Figures/Supplemental_Figure_5/S5c.pdf'

  setwd("/hpcdata/sg/sg_data/PROJECTS/Monogenic_Project")
}

# Source utilities
source('scripts/util/Enrichment/hyperGeo.R')
source('scripts/util/Signatures/get_signature_scores.R')
# Figure 5a -- Forest plots of signature scores for each study in the meta-analysis for somalogic grey module proteins

## Load meta-analysis result and comparison group pairs
results = readRDS(META.ANALYSIS.Z.SCORE.IN.PATH)
cgps = readRDS(CGPS.IN.PATH)

## Get a map between the study name and its corresponding disease
### Create an empty vector to hold these names
studiess = c()
### For each disease
for(disease in names(cgps)) {
  ### Get the name of the studies for that disease
  studies = names(cgps[[disease]])
  ### Create a map from the study name to its corresponding disease
  new_studies = rep(disease, length(studies))
  names(new_studies) = studies
  ### Add the map to the empty vector
  studiess = c(studiess, new_studies)
}

## Get the sample size for each study
### For each disease
sizess = lapply(names(cgps), function(disease) {
  ### For each study of that disease
  studies = names(cgps[[disease]])
  ### Get the number of samples in that study
  sizes = sapply(cgps[[disease]], function(study) {
    length(unlist(study))
  })
  ### Name the sizes vector
  names(sizes) = names(cgps[[disease]])
  return(sizes)
})
sizess = unlist(sizess)

## Get the effect sizes and standard errors associated with each study
effects = results$datasetEffectSizes
ses = results$datasetEffectSizeStandardErrors

## Get the overall effect size and standard error
meta_effects = results$pooledResults[, 'effectSize', drop = FALSE]
meta_ses = results$pooledResults[, 'effectSizeStandardError', drop = FALSE]

## Instantiate a function to create the data frame used for the forest plot of a single feature's signature
get_df = function(feature) {
  ## Create an initial data frame with feature names, effect sizes, and standard error
  df1 = data.frame(study = colnames(effects), effect = effects[feature,], se = 1.96 * ses[feature,])

  ## Add the diseases associated with each study, the feature name, and the study size associated with the study
  df1$disease = factor(studiess[df1$study], c('DM1', 'MS', 'RA', 'sarcoid','summary',''))
  df1$feature = feature
  df1$study.size = sizess[df1$study]

  ## We also create a second data frame that is essentially a blank row to separate the diamond from the dots
  df2 = data.frame(study = '', effect = 0, 
                   se = 0, feature = feature,
                   disease = '', study.size = 0)


  ## We create a third data frame that just contains the meta_analysis effect size for display via a triange
  df3 = data.frame(study = 'Summary', effect = meta_effects[feature, ], 
                   se = 1.96 * meta_ses[feature, ], feature = feature,
                   disease = 'summary', study.size = 50)


  ## We put the data frames together
  df = rbind(df1, df2)
  df = rbind(df, df3)

  ## We put all the studies together into a factor
  df$study = factor(df$study, levels = rev(levels(df$study)))
  return(df)
}

## We choose the features we want to show in the plot, and get their corresponding data frames for plotting
features = grep('^somalogic\\.grey\\.', rownames(meta_effects), value = T)
dfs = lapply(features, get_df)

## We combine these dataframes
df = Reduce(rbind,dfs)
df$feature = factor(df$feature, features)

## We rename the features for easier viewing
levels(df$feature) = levels(df$feature) %>%
  gsub('^somalogic\\.grey\\.', '', .) %>%
  gsub('\\.', ' ', .)

## We manually create the standard ggplot colors
hues = seq(15, 375, length = 6)
colors = hcl(h = hues, l = 65, c = 100)[1:5]

## And create the forest plot
p = ggplot(df, aes(x = effect, y = study, color = disease)) +
  geom_point(aes(size = study.size, shape = disease), show.legend = T) +
  scale_shape_manual(values = c(16, 16, 16, 16, 18, 16)) + # 16 is for a circle and 18 a triangle
  geom_errorbarh(aes(xmin=effect-se, xmax=effect+se), height=0, show.legend = F, size = 1) +
  scale_color_manual(values = c(colors,'transparent')) + # We want the dot at 0 in the empty row to be transparent (we make it 0 to avoid the warnings from using an NA)
  xlab('Effect Size') + ylab('Study') + 
  theme_bw() + geom_vline(xintercept = 0, linetype = 'dashed') + facet_wrap(~feature, nrow = 1) + # We have a dashed line at 0 to represent no effect
  theme(axis.ticks.y = element_blank())

ggsave(SUPPLEMENTAL.FIGURE.5a.OUT.PATH, p, device = 'pdf', height = 6, width = 12)

# Figure 5b -- Barplot of Jamboree result enrichments for surrogate signatures
## We get the results of doing the transcriptional surrogate signature gene set enrichment among features included in the meta-analysis
results = readRDS(META.ANALYSIS.ENRICHMENTS.IN.PATH) 

## We get the pvalues associated with each transcriptional surrogate signature
pvals = sapply(results, function(result) {result$p.value}) 

## And convert to a negative log10 pvalue
negative.log10.pvals = -log10(pvals)


## We create a data frame with the feature names and negative log10 pvalues
df = data.frame(feature = names(pvals), negative.log10.pvals = negative.log10.pvals)
## We arrange the features by pvalue
df = df %>%
  arrange(pvals) %>%
  mutate(feature = gsub("somalogic\\.grey\\.", "", feature)) %>%
  mutate(feature = gsub("tbnks\\.", "", feature)) %>%
  mutate(feature = gsub("beta", "b", feature)) %>%
  mutate(feature = replace(feature, feature == "somalogic.modules.purple", "PM2")) %>%
  mutate(feature = replace(feature, feature == "healthy.index", "Immune Health Metric")) %>%
  mutate(feature = factor(feature, levels = unique(feature)))


source("scripts/util/Plotting/tbnk_featurename_replace.R")
levels(df$feature) <- replace_tbnk_names(levels(df$feature))
#levels(df$feature)

df <- df %>% filter(feature != "microarray.classifier")

## We create the bar plots
p = ggplot(df, aes(x = feature, y = negative.log10.pvals)) + geom_bar(stat = 'identity') + 
  coord_flip() + ylab('-log10(p)') + xlab('Feature') +
  theme_bw() + 
  theme(
    axis.text.x = element_text(size = 15),
    axis.title.x = element_text(size = 15),
    axis.text.y = element_text(size = 15),
    axis.title.y = element_text(size = 15),
    legend.text = element_text(size = 15),
    legend.title = element_text(size = 15)
  )
ggsave(SUPPLEMENTAL.FIGURE.5b.OUT.PATH, p, device = 'pdf', height = 6, width = 6)

# Figure 5c - Venn Diagram comparing significant proteins for aging to proteins significant for the HI signature

## We extract the baltimore aging cohort eset
eset = readRDS(ESET.IN.PATH)
## We get the serum proteomic surrogate signature for the HI
signature = readRDS(SIGNATURE.IN.PATH)
## And the significance table from the ferrucci paper
table = read.table(TABLE.IN.PATH, header = TRUE, sep = '\t')

feature.data = fData(eset)

## We extract the features used in the HI surrogate signature
signature = unname(unlist(signature))
signature = signature[signature %in% feature.data$SomaId] ## And subset to only the somamers analyzed by Ferrucci's group

## We get the pvalues associated with each somamer from the Ferrucci group analysis
pvals = table$p
names(pvals) = table$SomaId
pvals = pvals[p.adjust(pvals, 'fdr') < .05] ## And subset to just the significant proteins

## We collect all the proteins
somamers = feature.data$SomaId

## And make a venn diagram of which proteins overlap and which do not in the significance analysis and in our signature
a = data.frame(`Aging` = somamers %in% names(pvals), `Healthy Index` = somamers %in% signature)
pdf(SUPPLEMENTAL.FIGURE.5c.OUT.PATH, height = 6, width = 6)
vennDiagram(vennCounts(a), names = c('Aging Proteins', 'HI Proteins'))
dev.off()

R Snakemake ggplot2 dplyr Diamond ggpubr Biobase MetaIntegrator From line 2 of Supplemental_Figure_5/supplemental_figure_5_version_4.R

library(Biobase)
source('scripts/util/Signatures/create_signature.R')

# Set globals
## The healthy index for each subject
HI.IN.PATH = snakemake@input[[1]]#'Classification/results/healthy_rf_results_all.RDS'
## The jive PC1 for each subject
PC1.IN.PATH = snakemake@input[[2]]#'Integration_output/jive/subject/prcomp_list.RDS'
## The design matrices
MATRICES.IN.PATH = snakemake@input[[3]]#'Classification/design_matrices/healthy_all_design_matrices_all.RDS'

## The HI serum proteomic surrogate signature
HI.OUT.PATH = snakemake@output[[1]]#'Classification/proteomic_surrogates/healthy.index.surrogates.RDS'
## The PC1 serum proteomic surrogate signature
PC1.OUT.PATH = snakemake@output[[2]]#'Classification/proteomic_surrogates/PC1.score.surrogates.RDS'

# Load files
results = readRDS(HI.IN.PATH)
matrices = readRDS(MATRICES.IN.PATH)

# Get the somalogic features matrix
X = matrices$somalogic.features

# Remove the 'somalogic.features' prefix from the matrix column names
colnames(X) = gsub('somalogic\\.features\\.', '', colnames(X))

# Here, we get the signature for the HI:
# Get the healthy index for each subject 
preds = results$all.modules.plus.grey.with.tbnks
# Ensure that the rows of X correspond to the same subjects in the predictions vector
stopifnot(all(rownames(X) == rownames(preds)))
# Get the HI proteomic surrogate signature
HI.signature = util.make.signature(preds, X)

# Here, we do the same for the PC1 score:
# Extract the jPC1 score
jive = readRDS(PC1.IN.PATH)
joint = jive$joint$x
# Reorder the subjects so they correspond to the rows of the data matrix
joint = joint[rownames(X), , drop = FALSE]
# Choose only the PC1 score
joint = joint[, 'PC1']
# Get the PC1 proteomic surrogate signature
PC1.signature = util.make.signature(joint, X)

# Save results
saveRDS(HI.signature, HI.OUT.PATH)
saveRDS(PC1.signature, PC1.OUT.PATH)

R Biobase From line 5 of analysis/make_proteomic_surrogate_signature.R

MATRICES.IN.PATH = snakemake@input[[1]]#'"Classification/design_matrices/healthy_all_design_matrices_all.RDS"'
## Corresponding meta data
META.IN.PATH = snakemake@input[[2]]#'Classification/meta_data/healthy_random_forest_sample_meta_data_all.RDS'
## GVI permutation testing pvalues
PVALS.IN.PATH = snakemake@input[[3]]#'Classification/results/healthy_rf_pvals_all.RDS'
## Healthy indexes
HI.IN.PATH = snakemake@input[[4]]#'Classification/results/healthy_rf_results_all.RDS'
## PC1 scores
PCS.IN.PATH = snakemake@input[[5]]#"Integration_output/jive/subject/prcomp_list.rds"
## Microarray modules
MICROARRAY.MODULES.IN.PATH = snakemake@input[[6]]#'Data/Microarray/analysis_output/WGCNA/modules.rds'
## Microarray classifier gene signature
MICROARRAY.CLASSIFIER.IN.PATH = snakemake@input[[7]]#'Classification/transcriptional_surrogates/microarray_classifier_signatures.RDS'

## Transcriptional surrogate signatures for HI, PC1, and most significant features from classifier
SIGNATURES.OUT.PATH = snakemake@output[[1]]#'Classification/transcriptional_surrogates/surrogate_signatures.RDS'

# Load utility function
source('scripts/util/Signatures/create_signature.R')

# Initiate signatures list
all.signatures = list()

# Load data and metadata
matrices = readRDS(MATRICES.IN.PATH)
meta = readRDS(META.IN.PATH)

## Get the results of the pvalue permutations
results = readRDS(PVALS.IN.PATH)

# Extract the data used in the classifier, and all of the microarray features
X = matrices$all.modules.plus.grey.with.tbnks
Y = matrices$microarray.features

# Restrict data to only patients
#X = X[meta$condition != 'Healthy', ]
#Y = Y[meta$condition != 'Healthy', ]

# Remove the 'microarray.features.' prefix from the microarray feature names
colnames(Y) = gsub('microarray\\.features\\.', '', colnames(Y))

# First, we get the surrogate signatures for the healthy index driving features

## Get the feature pvalues for the full classifier 
result = results$all.modules.plus.grey.with.tbnks

## Get the features with an FDR of less than .2
features = names(result)[p.adjust(result, 'fdr') < .2]

## For each feature, make a microarray surrogate signature for that feature
signatures = lapply(features, function(feature) {util.make.signature(X[,feature], Y)})

## Name each suggoate signature based on the feature being approximated 
names(signatures) = features

## Add these signatures to the list of signatures 
all.signatures = append(all.signatures, signatures)

# Next, we make a surrogate signature for the healthy index itself

## Read in the healthy indexes
healthy.indexes = readRDS(HI.IN.PATH)

## Subset to only patients
#healthy.indexes = healthy.indexes[meta$condition != "Healthy", ]

## Choose the healthy index from the full classifier
healthy.index = healthy.indexes$all.modules.plus.grey.with.tbnks

## Make the healthy index names the patient ids
names(healthy.index) = rownames(healthy.indexes)

## Ensure that the healthy index subjects are in the same order as the microarray features
stopifnot(all(names(healthy.index) == rownames(Y)))

## Construct the healthy index singature
signature = util.make.signature(healthy.index, Y)

## Add the healthy index signature to the list of signatures
all.signatures[['healthy.index']] = signature

# Now we make a surrogate signature for the PC1 Signature
## Extract the jPC1 scores
jive = readRDS(PCS.IN.PATH)
joint = jive$joint$x
jPC1 = joint[,1]

## Subset and reorder the jPC1 scores to correspond to the subjects / order 
## in the microarray features
jPC1 = jPC1[rownames(Y)]

## Construct the jPC1 surrogate signature
signature = util.make.signature(jPC1, Y)

## Add this signature to the list of signatures
all.signatures[['PC1']] = signature

# Now we add in pseudo-signatures of the microarray modules. As these do not
# require data type conversions, we simply let them be composed of all the genes
# in a given module

## We extract the microarray modules that with significant GVIs in the microarray
## module classifier
result = results$microarray.modules
modules = names(result)[p.adjust(result, 'fdr') < .2]

## We load the microarray modules memberships
module.memberships = readRDS(MICROARRAY.MODULES.IN.PATH)

## For each microarray module that passed the significance threshold
signatures = lapply(modules, function(module) {
  ## We change the name of the module's constituent genes
  ## to eliminate the microarray.modules prefix
  module = gsub('microarray\\.modules\\.', '', module)

  ## We get the genes belonging to the module
  module.members = names(module.memberships)[module.memberships == module]

  ## We return the pseudo-signature
  list(positive = module.members, negative = NULL)
})

## We name the signatures accordingly
names(signatures) = modules

## And add them to our list of signatures
all.signatures = append(all.signatures, signatures)

# Finally, we load and add the microarray logistic regression classifier top features
signature = readRDS(MICROARRAY.CLASSIFIER.IN.PATH)
all.signatures[['microarray.classifier']] = signature

# We remove any empty signatures we may have picked up
all.signatures = all.signatures[sapply(all.signatures, function(signature) {length(unlist(signature))}) > 0]

## And we save the signatures
saveRDS(all.signatures, SIGNATURES.OUT.PATH)

R From line 9 of analysis/make_transcriptional_surrogate_signatures.R

library(Biobase)

# Set paths
## The healthy index serum proteomic surrogate signature
HI.IN.PATH = snakemake@input[[1]]#'Classification/proteomic_surrogates/healthy.index.surrogates.RDS'
## The PC1 serum proteomic surrogate signature
PC1.IN.PATH = snakemake@input[[2]]#'Classification/proteomic_surrogates/PC1.score.surrogates.RDS'
## The somalogic training-level eset
ESET.IN.PATH = snakemake@input[[3]]#'Data/Somalogic/data_analysis_ready/analysis_ready_sample_level_training_somalogic.rds'
## The somalogic somamer table with Plasma and Serum dilutions
REF.TABLE.IN.PATH = snakemake@input[[4]]#'Data/Somalogic/raw/v1/somamer_table.txt'

## The healthy index plasma surrogate signature
HI.SOMAMER.OUT.PATH = snakemake@output[[1]]#'Classification/proteomic_surrogates/healthy.index.plasma.surrogates.RDS'
## The healthy index plasma surrogate signature with names as sequence ids rather than protein names
HI.ID.OUT.PATH = snakemake@output[[2]]#'Classification/proteomic_surrogates/healthy.index.plasma.surrogate.somaId.RDS'
## The PC1 index plasma surrogate signature
PC1.SOMAMER.OUT.PATH = snakemake@output[[3]]#'Classification/proteomic_surrogates/PC1.plasma.surrogates.RDS'
## The PC1 index plasma surrogate signature with names as sequence ids rather than protein names
PC1.ID.OUT.PATH = snakemake@output[[4]]#'Classification/proteomic_surrogates/PC1.plasma.surrogate.somaId.RDS'

# Load data
healthy.index.signature = readRDS(HI.IN.PATH)
PC1.score.signature = readRDS(PC1.IN.PATH)
eset = readRDS(ESET.IN.PATH)

# Read in the reference table with somamer information
ref = read.table(REF.TABLE.IN.PATH, sep = '\t', header = TRUE, comment.char = '', quote = c())

# Get the somamer feature meta data from the eset
feature.meta = fData(eset)

# Create a map from somamer ids to features names
conversion = feature.meta$SomaId
names(conversion) = rownames(feature.meta)

# Create a backward map
inv_conversion = names(conversion)
names(inv_conversion) = conversion

# Instantiate a function to subset a signature to the desired targets
process_signature = function(signature) {
  ## For each half signature (positive and negative), convert the proteins name to somamer ids
  signature.converted = lapply(signature, function(x) {unname(unlist(conversion[x]))})

  ## Make sure that there are no somamers in the signature not included in the reference table
  stopifnot(setdiff(unlist(signature.converted), ref$SomaId) == character(0))

  ## Subset the reference table to just those with the same dilutions in serum and plasma
  ref.subset = ref[ref$CommonDilution == 'No',]

  ## Subset each half signature (positive and negative) to just porteins with the same dilutions in
  ## serum and plasma
  signature.converted = lapply(signature.converted, function(x) {setdiff(x, ref.subset$SomaId)})

  ## Return both half signatures to the original protein names rather than somamer ids
  signature = sapply(signature.converted, function(x) {unname(inv_conversion[x])})

  return(list(targets = signature, somaIds = signature.converted))
}

# Convert the serum HI signature to a plasma HI signature
signature = process_signature(healthy.index.signature)

# Save the results
saveRDS(signature$targets, HI.SOMAMER.OUT.PATH)
saveRDS(signature$somaIds, HI.ID.OUT.PATH)

# Convert the serum PC1 signature to a plasma PC1 signature
signature = process_signature(PC1.score.signature)

# Save the results
saveRDS(signature$targets, PC1.SOMAMER.OUT.PATH)
saveRDS(signature$somaIds, PC1.ID.OUT.PATH)

R Biobase From line 7 of analysis/remove_unwanted_somamers.R

set.seed(140)

# Load libraries and source utility functions
library(Biobase)
library(glmnet)
source('scripts/util/Classification/get_aucs.R')

# Set paths
## Design matrices with various data types
MATRICES.IN.PATH = snakemake@input[[1]]#'Classification/design_matrices/healthy_all_design_matrices_all.RDS'
## Corresponding metdata
META.IN.PATH = snakemake@input[[2]]#'Classification/meta_data/healthy_random_forest_sample_meta_data_all.RDS'

## Microarray classifier gene signature
SIGNATURE.OUT.PATH = snakemake@output[[1]]#'Classification/transcriptional_surrogates/microarray_classifier_signatures.RDS'

# Load design matrices
Xs = readRDS(MATRICES.IN.PATH)
meta = readRDS(META.IN.PATH)

# Extract microarray design matrix 
X = Xs$microarray.features

# Get response vector
y = as.numeric(meta$condition == 'Healthy')

# Scale design matrix
X = scale(X)

# Divide indices into n randomly sampled cross validation groups
xs = 1:length(y)
n = 10
f = factor(xs %% n)
groups = split(sample(xs), f)

# Her we get the cross validation predictions
# For each group of samples
predictions = lapply(groups, function(group) {
  # Subset the design matrix to samples not in that group for training
  X.train = X[-group, , drop = FALSE]
  # Subset the design matrix to just samples in that group for testing
  X.test = X[group, , drop = FALSE]
  # Get the responses vector of the training group
  y.train = y[-group]
  # Train a L2 penalty logistic regression model using the training data
  model = cv.glmnet(X.train, y.train, family = 'binomial', alpha = 0)
  # Get the probabilities that each sample from the test group is of the positive class (healthy)
  predictions = predict(model, X.test)[,'1']
})

# Put predictions into a vector and get corresponding conditions
predictions = unname(unlist(predictions))
conditions = meta$condition[unlist(groups)]

# Get an estimate for the auc
roc = get_roc(predictions, conditions, 'Healthy')
auc = get_auc(roc)
print('Classifier AUC:')
print(auc)

# Get most important m features for the prediction
m = 500
model = cv.glmnet(X, y, family = 'binomial', alpha = 0)

# Here we extract the model coefficients (note that the coefficients with
# highest absolute values should be the most important, as the data was scaled)
coefs = coef(model)
# Convert the coefficents to a vector
coefs = coefs[,1]
# Sort the coefs by absolute value
coefs = coefs[order(abs(coefs), decreasing = TRUE)]
# Remove the intercept term
coefs = coefs[names(coefs) != '(Intercept)']
# Remove the 'microarray.features' prefixes from the genes
names(coefs) = gsub('microarray\\.features\\.', '', names(coefs))
# Extract the top m features from the model
features = names(coefs)[1:m]

# Make a signature based on the most important features
signature = list(positive = features[coefs[features] > 0], negative = features[coefs[features] < 0])

# Save the signature
saveRDS(signature, SIGNATURE.OUT.PATH)

R Biobase glmnet From line 7 of analysis/run_microarray_classifier.R

source('scripts/util/Processing/averageRepeatSamples.R')
source('scripts/util/WGCNA/get_eigengene_scores.R')

# Set paths
## Somalogic modules
MODULES.IN.PATH = snakemake@input[[1]]#'Data/Somalogic/analysis_output/wgcna_results/modules.rds'
## Somalogic subject-level training eset
TRAINING.SET.SOMALOGIC.ESET.IN.PATH = snakemake@input[[2]]#'Data/Somalogic/data_analysis_ready/analysis_ready_subject_level_training_somalogic.rds'
## Somalogic sample-level testing eset
TESTING.SET.SOMALOGIC.ESET.IN.PATH = snakemake@input[[3]]#'Data/Somalogic/data_analysis_ready/analysis_ready_sample_level_testing_somalogic.rds'

## Sample level somalogic module scores for testing set
TESTING.SET.SAMPLE.LEVEL.SCORES.ESET.OUT.PATH = snakemake@output[[1]]#'Data/Somalogic/analysis_output/wgcna_results/scores_sample_level_testing.rds'
## Subject level somalogic module scores for testing set
TESTING.SET.SUBJECT.LEVEL.SCORES.ESET.OUT.PATH = snakemake@output[[2]]#'Data/Somalogic/analysis_output/wgcna_results/scores_subject_level_testing.rds'

# Load data
modules = readRDS(MODULES.IN.PATH)
training.set.somalogic.eset = readRDS(TRAINING.SET.SOMALOGIC.ESET.IN.PATH)
testing.set.somalogic.eset = readRDS(TESTING.SET.SOMALOGIC.ESET.IN.PATH)

# Get the sample level module scores for the testing eset
testing.set.sample.level.scores.eset = get_eigengene_scores(training.set.somalogic.eset, testing.set.somalogic.eset, modules)

# Average over samples within a subject to get the subject level module scores for the testing eset
testing.set.subject.level.scores.eset = averageRepeatSamples(testing.set.sample.level.scores.eset)

# Save results
saveRDS(testing.set.sample.level.scores.eset, TESTING.SET.SAMPLE.LEVEL.SCORES.ESET.OUT.PATH)
saveRDS(testing.set.subject.level.scores.eset, TESTING.SET.SUBJECT.LEVEL.SCORES.ESET.OUT.PATH)

R From line 4 of analysis/get_testing_set_somalogic_module_scores.R

library(WGCNA)
library(Biobase)

# Set seed
set.seed(130)

# Source wgcna utility functions
source('scripts/util/WGCNA/runWGCNA.r')
source('scripts/util/Processing/averageRepeatSamples.R')
source('scripts/util/WGCNA/get_eigengene_scores.R')
source('scripts/util/Processing/removeOutlierPatients.R')

# Set GlobalVariables
## Clean sample-level somalogic data
SAMPLES.IN.PATH = snakemake@input[[1]]#'Data/Somalogic/data_analysis_ready/analysis_ready_sample_level_training_somalogic.rds'

## The somalogic WGCNA feature to module map
MODULES.OUT.PATH = snakemake@output[[1]]#'Data/Somalogic/analysis_output/wgcna_results/modules.rds'
## The sample-level somalogic module scores
SCORES.SAMPLE.LEVEL.OUT.PATH = snakemake@output[[2]]#'Data/Somalogic/analysis_output/wgcna_results/scores_sample_level.rds'
## The subject-level somalogic module scores
SCORES.SUBJECT.LEVEL.OUT.PATH = snakemake@output[[3]]#'Data/Somalogic/analysis_output/wgcna_results/scores_subject_level.rds'
## The variances explained by PC1 of each module
VARIANCES.OUT.PATH = snakemake@output[[4]]#'Data/Somalogic/analysis_output/wgcna_results/variances.rds'
## WGCNA intermediate objects
INTERMEDIATES.OUT.PATH = snakemake@output[[5]]#'Data/Somalogic/analysis_output/wgcna_results/WGCNA_somalogic_intermediates.rds'
## Outlier removal plots
OUTLIER.REMOVAL.PLOTS.OUT.PATH = snakemake@output[[6]]#'Paper_1_Figures/Supplemental_Figure_1/somalogic_outlier_removal_for_wgcna.pdf'
## Diagnostic plots from WGCNA module creation
WGCNA.PLOTS.OUT.PATH = snakemake@output[[7]]#'Paper_1_Figures/Supplemental_Figure_1/somalogic_wgcna_module_creation.pdf'

# Load data
somalogic.samples = readRDS(SAMPLES.IN.PATH)

# Prevent WGCNA from operating with parallel
disableWGCNAThreads()

# Remove outlier samples (and plot results)
pdf(OUTLIER.REMOVAL.PLOTS.OUT.PATH)
somalogic.samples.filtered = removeOutlierPatients(somalogic.samples, cutHeight = 75)
dev.off()

# Calculate the subject level data without outliers
somalogic.subjects = averageRepeatSamples(somalogic.samples.filtered)

# Run wgcna function
## Here we use pamStage = FALSE and method = 'tree' even though these options are set to TRUE
## and 'hybrid' in the tutorial respectively,
## because using the tutorial's options result in lower median module variances explained and a smaller
## number of modules
modules = runWGCNA(somalogic.subjects, OUTDIR, method = 'tree', pamStage = FALSE, pamRespectsDendro = FALSE,
                   beta = 12, deepSplit = 2, minModuleSize = 30, 
                   intermediate.results.path = INTERMEDIATES.OUT.PATH, 
                   diagnostic.plots.path = WGCNA.PLOTS.OUT.PATH)

# Get the scores associated with each sample for each module
scores.sample.level = get_eigengene_scores(somalogic.subjects, somalogic.samples, modules)

# Get the variances explained by PC1 of each module
variances = get_eigengene_variance_explained(somalogic.subjects, modules)

# Average over repeat samples
scores.subject.level = averageRepeatSamples(scores.sample.level)

# Save modules and scores
saveRDS(modules, file = MODULES.OUT.PATH)
saveRDS(scores.sample.level, file = SCORES.SAMPLE.LEVEL.OUT.PATH)
saveRDS(scores.subject.level, file = SCORES.SUBJECT.LEVEL.OUT.PATH)
saveRDS(variances, file = VARIANCES.OUT.PATH)

R Biobase WGCNA From line 7 of analysis/run_somalogic_wgcna.R

library(Biobase)

# Set paths
## The hybrid-normalized, calibration-normalized, and median-normalized RFU outputs from the Somalogic assay
SOMALOGIC.RFUS.IN.PATH = snakemake@input[[1]]#'Data/Somalogic/raw/v1/Cal_QC_CHI_Hyb.Cal.MedNorm_RFU.txt'
## Sample metadata associated with the Somalogic assay
SOMALOGIC.SAMPLES.IN.PATH = snakemake@input[[2]]#'Data/Somalogic/raw/v1/Cal_QC_CHI_Samples.txt'
## Somamer metadata associated with the Somalogic assay
SOMALOGIC.SOMAMERS.IN.PATH = snakemake@input[[3]]#'Data/Somalogic/raw/v1/Cal_QC_CHI_Somamers.txt'
## The monogenic metadata database
DATABASE.IN.PATH = snakemake@input[[4]]#'Metadata/monogenic.de-identified.metadata.RData'

## the Somalogic testing eset
TESTING.OUT.PATH = snakemake@output[[1]]#'Data/Somalogic/processed/v1/testing_somalogic.rds'
## the Somalogic training eset
TRAINING.OUT.PATH = snakemake@output[[2]]#'Data/Somalogic/processed/v1/training_somalogic.rds'
## the Somalogic QC eset
QC.OUT.PATH = snakemake@output[[3]]#'Data/Somalogic/processed/v1/qc_somalogic.rds'

# Define the delimiter for reading files
SEP = '\t'

# Load Data
rfus = read.table(SOMALOGIC.RFUS.IN.PATH, sep = SEP, header = FALSE)
sample_metadata = read.table(SOMALOGIC.SAMPLES.IN.PATH, sep = SEP, stringsAsFactors = FALSE, header = TRUE)
somamer_metadata = read.table(SOMALOGIC.SOMAMERS.IN.PATH, sep = SEP, stringsAsFactors = FALSE, header = TRUE)
load(DATABASE.IN.PATH)

# Change data from a dataframe to a matrix
rfus = as.matrix(rfus)

# Transpose data so that rows are features and columns are samples
rfus = t(rfus)

# Add row names to the meta data
rownames(sample_metadata) = paste(sample_metadata$PlateId, sample_metadata$PlatePosition, sep = '-')
rownames(somamer_metadata) = make.names(somamer_metadata$Target)

# Add row names and column names to the Somalogic RFUs
colnames(rfus) = rownames(sample_metadata)
rownames(rfus) = rownames(somamer_metadata)

# Ensure there are no patients in the somalogic data that are not in the database
stopifnot(all(colnames(rfus) %in% rownames(monogenic.somalogic)))

# Get the sample metadata from the Monogenic Database
sample_metadata = monogenic.somalogic[colnames(rfus),]

# Log transform all the somalogic RFUs
log.rfus = log2(rfus)

# Turn into an expression set
somalogic = ExpressionSet(log.rfus)
phenoData(somalogic) = AnnotatedDataFrame(sample_metadata)
featureData(somalogic) = AnnotatedDataFrame(somamer_metadata)

# Add 'V' and 'P' to visit and patient ids respectively
somalogic$patient_id = paste0('P', as.character(somalogic$patient_id))
somalogic$visit_id = paste0('V', as.character(somalogic$visit_id))

# Subset expression set to training cohort
somalogic.train = somalogic[, somalogic$analysis_group == 'Discovery']
somalogic.test = somalogic[, somalogic$analysis_group == 'Validation']
somalogic.qc = somalogic[, somalogic$analysis_group == 'QC']

# Export Training Somalogic
saveRDS(somalogic.train, file = TRAINING.OUT.PATH)
saveRDS(somalogic.test, file = TESTING.OUT.PATH)
saveRDS(somalogic.qc, file = QC.OUT.PATH)

R Biobase From line 7 of preprocessing/preprocess_somalogic.R

library(WGCNA)
library(Biobase)

# Source utility functions
source('scripts/util/Processing/averageTechnicalReplicates.R')
source('scripts/util/Processing/averageRepeatSamples.R')

# Set Global Variables
## The somalogic testing eset (prior to cleaning)
SAMPLES.IN.PATH = snakemake@input[[1]]#'Data/Somalogic/processed/v1/testing_somalogic.rds'

## The cleaned up sample-level somalogic testing eset
SAMPLE.LEVEL.OUT.PATH = snakemake@output[[1]]#'Data/Somalogic/data_analysis_ready/analysis_ready_sample_level_testing_somalogic.rds'
## The cleaned up subject-level somalogic testing eset
SUBJECT.LEVEL.OUT.PATH = snakemake@output[[2]]#'Data/Somalogic/data_analysis_ready/analysis_ready_subject_level_testing_somalogic.rds'

# Load data
somalogic.samples = readRDS(SAMPLES.IN.PATH)

# Remove unwanted visits for the following reasons:
## V346: hemolysis level 3
## V357: hemolysis level 2
## V354: cloudy
samples.to.remove = c('V346', 'V354', 'V357')
somalogic.samples = somalogic.samples[, ! somalogic.samples$visit_id %in% samples.to.remove]

# Remove unwanted somamers for the following reasons
## EGFRvIII: Removed from Somalogic panel, found to be cross reactive to an unknown source
features.to.remove = c('EGFRvIII')
somalogic.samples = somalogic.samples[! rownames(somalogic.samples) %in% features.to.remove,]

# Average technical replicates
somalogic.samples = averageTechnicalReplicates(somalogic.samples,
                                               visit.id.col = 'visit_id',
                                               meta.cols = c('patient_id',
                                                             'gender',
                                                             'patient_age_at_time_of_blood_draw',
                                                             'race',
                                                             'condition',
                                                             'ethnicity',
                                                             'plate_id',
                                                             'assay_desc',
                                                             'visit_type'))

# Rename columns to the visit id
colnames(somalogic.samples) = somalogic.samples$visit_id

# Collapse samples into subject through averaging
somalogic.subjects = averageRepeatSamples(somalogic.samples)

# Save results
saveRDS(somalogic.samples, file = SAMPLE.LEVEL.OUT.PATH)
saveRDS(somalogic.subjects, file = SUBJECT.LEVEL.OUT.PATH)

R Biobase WGCNA From line 7 of preprocessing/ready_somalogic_testing.R

library(WGCNA)
library(Biobase)

# Source utility functions
source('scripts/util/Processing/averageTechnicalReplicates.R')
source('scripts/util/Processing/averageRepeatSamples.R')

# Set Global Variables
## The somalogic training eset (prior to cleaning)
SAMPLES.IN.PATH = snakemake@input[[1]]#'Data/Somalogic/processed/v1/training_somalogic.rds'

## The cleaned up sample-level somalogic training eset
SAMPLE.LEVEL.OUT.PATH = snakemake@output[[1]]#'Data/Somalogic/data_analysis_ready/analysis_ready_sample_level_training_somalogic.rds'
## The cleaned up sample-level somalogic training eset
SUBJECT.LEVEL.OUT.PATH = snakemake@output[[2]]#'Data/Somalogic/data_analysis_ready/analysis_ready_subject_level_training_somalogic.rds'

# Load data
somalogic.samples = readRDS(SAMPLES.IN.PATH)

# Remove unwanted visits for the following reasons:
## V313: hemolysis level 4
## V282: hemolysis level 4
## V210: very odd cloudy/milky sample
samples.to.remove = c('V313', 'V282', 'V210')
somalogic.samples = somalogic.samples[, ! somalogic.samples$visit_id %in% samples.to.remove]

# Remove unwanted somamers for the following reasons
## EGFRvIII: Removed from Somalogic panel, found to be cross reactive to an unknown source
features.to.remove = c('EGFRvIII')
somalogic.samples = somalogic.samples[! rownames(somalogic.samples) %in% features.to.remove,]

# Average technical replicates
somalogic.samples = averageTechnicalReplicates(somalogic.samples,
                                               visit.id.col = 'visit_id',
                                               meta.cols = c('patient_id',
                                                             'gender',
                                                             'patient_age_at_time_of_blood_draw',
                                                             'race',
                                                             'condition',
                                                             'ethnicity',
                                                             'plate_id',
                                                             'assay_desc',
                                                             'visit_type'))

# Rename columns to the visit id
colnames(somalogic.samples) = somalogic.samples$visit_id

# Collapse samples into subject through averaging
somalogic.subjects = averageRepeatSamples(somalogic.samples)

# Save results
saveRDS(somalogic.samples, file = SAMPLE.LEVEL.OUT.PATH)
saveRDS(somalogic.subjects, file = SUBJECT.LEVEL.OUT.PATH)

R Biobase WGCNA From line 7 of preprocessing/ready_somalogic_training.R

set.seed(131)

# Load libraries
library(variancePartition)
library(Biobase)
source('scripts/util/VariancePartition/variancePartition.R')

# Set input paths
## Sample-level eset input paths
ESET.IN.PATHS = list(
  somalogic.features = snakemake@input[[1]],#'Data/Somalogic/data_analysis_ready/analysis_ready_sample_level_training_somalogic.rds',
  somalogic.modules = snakemake@input[[2]],#'Data/Somalogic/analysis_output/wgcna_results/scores_sample_level.rds',
  microarray.features = snakemake@input[[3]],#'Data/Microarray/data_analysis_ready/eset_batch_training_sample.rds',
  microarray.modules = snakemake@input[[4]],#'Data/Microarray/analysis_output/WGCNA/array_sample_scores.rds',
  tbnks = snakemake@input[[5]]#'Data/TBNK/data_analysis_ready/tbnk_training_sample_level_eset.rds'
)

# Set output paths
## Variance partitions into patient-explained variance and residual variance
VP.OUT.PATHS = list(
  somalogic.features = snakemake@output[[1]],#'Data/Somalogic/analysis_output/stability/somalogic_features_standard_vp.rds',
  somalogic.modules = snakemake@output[[2]],#'Data/Somalogic/analysis_output/stability/somalogic_modules_standard_vp.rds',
  microarray.features = snakemake@output[[3]],#'Data/Microarray/analysis_output/stability/microarray_features_standard_vp.rds',
  microarray.modules = snakemake@output[[4]],#'Data/Microarray/analysis_output/stability/microarray_modules_standard_vp.rds',
  tbnks = snakemake@output[[5]]#'Data/TBNK/analysis_output/stability/tbnks_features_standard_vp.rds'
)

# Load data
esets = lapply(ESET.IN.PATHS, readRDS)

# Run variance partitions
print(VP.OUT.PATHS)
vps = lapply(esets, patient_id_variance_partition)

# Save results
mapply(function(vp, out.path) {
  saveRDS(vp, out.path)
}, vps, VP.OUT.PATHS)

R Biobase variancePartition From line 4 of analysis/run_variance_partitions.R

library(variancePartition)
library(Biobase)

# We set the global paths
## Sample level esets
ESET.SAMPLE.LEVEL.IN.PATHS = list(
  somalogic.features = snakemake@input[[1]],#'Data/Somalogic/data_analysis_ready/analysis_ready_sample_level_training_somalogic.rds',
  somalogic.modules = snakemake@input[[2]],#'Data/Somalogic/analysis_output/wgcna_results/scores_sample_level.rds',
  microarray.features = snakemake@input[[3]],#'Data/Microarray/data_analysis_ready/eset_batch_training_sample.rds',
  microarray.modules = snakemake@input[[4]],#'Data/Microarray/analysis_output/WGCNA/array_sample_scores.rds',
  tbnks = snakemake@input[[5]]#'Data/TBNK/data_analysis_ready/tbnk_training_sample_level_eset.rds'
)

## Subject level esets
ESET.SUBJECT.LEVEL.IN.PATHS = list(
  somalogic.features = snakemake@input[[6]],#'Data/Somalogic/data_analysis_ready/analysis_ready_subject_level_training_somalogic.rds',
  somalogic.modules = snakemake@input[[7]],#'Data/Somalogic/analysis_output/wgcna_results/scores_subject_level.rds',
  microarray.features = snakemake@input[[8]],#Data/Microarray/data_analysis_ready/eset_batch_training_subject.rds',
  microarray.modules = snakemake@input[[9]],#'Data/Microarray/analysis_output/WGCNA/array_subject_scores.rds',
  tbnks = snakemake@input[[10]]#'Data/TBNK/data_analysis_ready/tbnk_training_subject_level_eset.rds'
)

## Variance partitions into patient-explained variance and residual variance
VP.IN.PATHS = list(
  somalogic.features = snakemake@input[[11]],#'Data/Somalogic/analysis_output/stability/somalogic_features_standard_vp.rds',
  somalogic.modules = snakemake@input[[12]],#'Data/Somalogic/analysis_output/stability/somalogic_modules_standard_vp.rds',
  microarray.features = snakemake@input[[13]],#'Data/Microarray/analysis_output/stability/microarray_features_standard_vp.rds',
  microarray.modules = snakemake@input[[14]],#'Data/Microarray/analysis_output/stability/microarray_modules_standard_vp.rds',
  tbnks = snakemake@input[[15]]#'Data/TBNK/analysis_output/stability/tbnks_features_standard_vp.rds'
)

## Sample level esets, subset to just stable features
STABLE.ESET.SAMPLE.LEVEL.OUT.PATHS = list(
  somalogic.features = snakemake@output[[1]],#'Data/Somalogic/analysis_output/stability/stable_somalogic_sample_level_features.rds',
  somalogic.modules = snakemake@output[[2]],#'Data/Somalogic/analysis_output/stability/stable_somalogic_sample_level_modules.rds',
  microarray.features = snakemake@output[[3]],#'Data/Microarray/analysis_output/stability/stable_microarray_sample_level_features.rds',
  microarray.modules = snakemake@output[[4]],#'Data/Microarray/analysis_output/stability/stable_microarray_sample_level_modules.rds',
  tbnks = snakemake@output[[5]]#'Data/TBNK/analysis_output/stability/stable_tbnk_sample_level_features.rds'
)

## Subject level esets, subset to just stable features
STABLE.ESET.SUBJECT.LEVEL.OUT.PATHS = list(
  somalogic.features = snakemake@output[[6]],#'Data/Somalogic/analysis_output/stability/stable_somalogic_subject_level_features.rds',
  somalogic.modules = snakemake@output[[7]],#'Data/Somalogic/analysis_output/stability/stable_somalogic_subject_level_modules.rds',
  microarray.features = snakemake@output[[8]],#'Data/Microarray/analysis_output/stability/stable_microarray_subject_level_features.rds',
  microarray.modules = snakemake@output[[9]],#'Data/Microarray/analysis_output/stability/stable_microarray_subject_level_modules.rds',
  tbnks = snakemake@output[[10]]#'Data/TBNK/analysis_output/stability/stable_tbnk_subject_level_features.rds'
)

# Load the data
sample.esets = lapply(ESET.SAMPLE.LEVEL.IN.PATHS, readRDS)
subject.esets = lapply(ESET.SUBJECT.LEVEL.IN.PATHS, readRDS)
vps = lapply(VP.IN.PATHS, readRDS)

# Make a function to subset an eset to just the features for which
# the patient covariate explains at least half the total variance
select.stable = function(eset, vp) {
  stable.eset = eset[rownames(vp)[vp$Patient >= .5], ]
  return(stable.eset)
}

# Initate a function to save the esets
save.eset = function(eset, path) {
  saveRDS(eset, path)
  return('eset saved')
}

# Subset esets
stable.sample.esets = mapply(select.stable, sample.esets, vps, SIMPLIFY = FALSE)
stable.subject.esets = mapply(select.stable, subject.esets, vps, SIMPLIFY = FALSE)

# Save esets
out = mapply(save.eset, stable.sample.esets, STABLE.ESET.SAMPLE.LEVEL.OUT.PATHS)
out = mapply(save.eset, stable.subject.esets, STABLE.ESET.SUBJECT.LEVEL.OUT.PATHS)

R Biobase variancePartition From line 5 of analysis/select_stable_features.R

library(Biobase)

# Set globals
## Monogenic metadata database
DATABASE.IN.PATH = snakemake@input[[1]]#'Metadata/monogenic.de-identified.metadata.RData'

## the tbnk training eset (prior to cleaning)
TRAINING.OUT.PATH = snakemake@output[[1]]#'Data/TBNK/processed/tbnk_eset_training.rds'
## the tbnk testing eset (prior to cleaning)
TESTING.OUT.PATH = snakemake@output[[2]]#'Data/TBNK/processed/tbnk_eset_testing.rds'

# Load monogenic database and extract tbnks
load(DATABASE.IN.PATH)
tbnks = monogenic.tbnk

# Make Matrix

## Convert patients and visit ids to the 'P_' and 'V_' format
tbnks$patient_id = paste0('P', as.numeric(tbnks$patient_id))
tbnks$visit_id = paste0('V', as.numeric(tbnks$visit_id))

## Make df with only CBC/TBNK parameters (i.e. remove meta data columns)
unit.columns = grep('uom', colnames(tbnks), value = T)
range.columns = grep('range', colnames(tbnks), value = T)
features.to.remove = c(unit.columns, range.columns, c('patient_id', 'visit_id', 'version'))
features = setdiff(colnames(tbnks), features.to.remove)
X = tbnks[, features]

## Name the df rows based on the visit ids
rownames(X) = tbnks$visit_id

## Remove any features that are NA for all samples (i.e. the BANDs)
X = X[, apply(X, 2, function(x) {! all(is.na(x)) })]

## Convert the df to a matrix and transpose it so it is samples x features
X = t(as.matrix(X))

# Make PhenoData

## Choose columns from the monogenic database we wish to use as metadata
meta.features = c('patient_id', 
                  'visit_id',
                  'condition',
                  'patient_age_at_time_of_blood_draw',
                  'gender','race',
                  'ethnicity',
                  'analysis_group',
                  'visit_type')

## Get the corresponding columns from the monogenic database
meta.data = monogenic.all.assays[meta.features]
meta.data = unique(meta.data)
meta.data = meta.data[!is.na(meta.data$visit_id), ]

## Convert patients and visit ids to the 'P_' and 'V_' format
meta.data$visit_id = paste0('V', as.character(meta.data$visit_id))
meta.data$patient_id = paste0('P', as.character(meta.data$patient_id))

## Name the metadata rows based on the visit ids
rownames(meta.data) = meta.data$visit_id

## Select only the visit ids included in the TBNK data
meta.data = meta.data[colnames(X), ]

# Make Feature Data

## Extract units from the units columns in the included features
uoms = sapply(rownames(X), function(feature) {
  name = paste0(feature,'_uom')
  if(name %in% colnames(tbnks)) {
    return(unique(na.omit(tbnks[[name]])))
  } else {
    return(NA)
  }
})

## Make a data frame from the units of measure
f.data = data.frame(uoms = uoms, stringsAsFactors = FALSE)
rownames(f.data) = rownames(X)

## If the units of a feature are missing (i.e. for the TBNKs) and the feature is an absolute quantity,
## make the units that of a concentration
f.data[grepl('_count', rownames(f.data)) & is.na(f.data$uoms), 'uoms'] = '/uL'
## If the units of a feature are missing (i.e. for the TBNKs) and the feature is a relative,
## make the units a percentage
f.data[(!grepl('_count', rownames(f.data))) & is.na(f.data$uoms), 'uoms'] = '%'

# Put together expression set
tbnks.eset = ExpressionSet(X)
phenoData(tbnks.eset) = AnnotatedDataFrame(meta.data)
featureData(tbnks.eset) = AnnotatedDataFrame(f.data)

# Split eset into training and testing
tbnks.train.eset = tbnks.eset[, tbnks.eset$analysis_group == 'Discovery']
tbnks.test.eset = tbnks.eset[, tbnks.eset$analysis_group == 'Validation']

# Save outputs
saveRDS(tbnks.train.eset, file = TRAINING.OUT.PATH)
saveRDS(tbnks.test.eset, file = TESTING.OUT.PATH)

R Biobase From line 5 of preprocessing/create_tbnk_eset.R

library(Biobase)

# Source utilities
source('scripts/util/Processing/averageRepeatSamples.R')

# Set globals
## the tbnk testing eset (prior to cleaning)
TBNKS.IN.PATH = snakemake@input[[1]]#'Data/TBNK/processed/tbnk_eset_testing.rds'

## the cleaned up sample-level tbnk testing eset
TBNKS.SAMPLE.LEVEL.OUT.PATH = snakemake@output[[1]]#'Data/TBNK/data_analysis_ready/tbnk_testing_sample_level_eset.rds'
## the cleaned up subject-level tbnk testing eset
TBNKS.SUBJECT.LEVEL.OUT.PATH = snakemake@output[[2]]#'Data/TBNK/data_analysis_ready/tbnk_testing_subject_level_eset.rds'

# Load data
tbnks = readRDS(TBNKS.IN.PATH)

# Convert all features expressed in percent lymphocytes
# to percent of WBCs (and also rename them with 'percent')
lymphocyte.subsets = c('cd3', 'cd4_cd3', 'cd8_cd3', 'cd19', 'nk_cells')
lymphocytes.percent = exprs(tbnks)['lymphocytes_percent', ]
for(feature in lymphocyte.subsets) {
  exprs(tbnks)[feature, ] = exprs(tbnks)[feature, ] * lymphocytes.percent / 100
  rownames(tbnks)[rownames(tbnks) == feature] = paste0(feature, '_percent')
}

# Rename _count to _abs for lymphocytes
rownames(tbnks) = gsub('_count', '_abs', rownames(tbnks))

# Choose the features we wish to include (note platelets are included as a 'population')
features = rownames(tbnks)
marrow.features = c('wbc', 'rbc', 'hemoglobin', 'mcv', 'mch', 'mchc', 'rdw')
absolute.population.features = grep('_abs', features, value = T)
relative.population.features = grep('_percent', features, value = T)
features.to.include = c(marrow.features, absolute.population.features, relative.population.features)

# Manually remove some additional features for the following reasons
# immature_granulocytes_abs: many samples were NA
# immature_granulocytes_percent: many samples were NA
# mpv: many samples were NA
# nucleated_rbc_abs: too few to be reliable
# nucleated_rbc_percent: too few to be reliable 
features.to.exclude = c('immature_granulocytes_abs',
                        'immature_granulocytes_percent',
                        'nucleated_rbc_abs',
                        'nucleated_rbc_percent',
                        'mpv')
features.to.include = setdiff(features.to.include, features.to.exclude)

# Subset to the desired features
tbnks = tbnks[features.to.include,]

# Only use samples in which all desired features have complete data
tbnks = tbnks[, complete.cases(t(exprs(tbnks)))]

# Add in the neutrophil to lymphocyte ratio to the features and feature data
df = as.data.frame(t(exprs(tbnks)))
df$'NLR' = df[['neutrophil_abs']] / df[['lymphocytes_abs']] * 100
f.data = fData(tbnks)
f.data['NLR',] = '%'

# Convert the data frame back to a matrix and transpose
X = t(as.matrix(df))

# Wrap everything up into an expression set
tbnks.samples = ExpressionSet(X)
phenoData(tbnks.samples) = phenoData(tbnks)
featureData(tbnks.samples) = AnnotatedDataFrame(f.data)

# Average over biological repeats
tbnks.subjects = averageRepeatSamples(tbnks.samples)

# Save esets
saveRDS(tbnks.samples, TBNKS.SAMPLE.LEVEL.OUT.PATH)
saveRDS(tbnks.subjects, TBNKS.SUBJECT.LEVEL.OUT.PATH)

R Biobase From line 12 of preprocessing/ready_tbnks_testing.R

library(Biobase)

# Source utilities
source('scripts/util/Processing/averageRepeatSamples.R')

# Set globals
## the tbnk training eset (prior to cleaning)
TBNKS.IN.PATH = snakemake@input[[1]]#'Data/TBNK/processed/tbnk_eset_training.rds'

## the cleaned up sample-level tbnk training eset
TBNKS.SAMPLE.LEVEL.OUT.PATH = snakemake@output[[1]]#'Data/TBNK/data_analysis_ready/tbnk_training_sample_level_eset.rds'
## the cleaned up subject-level tbnk training eset
TBNKS.SUBJECT.LEVEL.OUT.PATH = snakemake@output[[2]]#'Data/TBNK/data_analysis_ready/tbnk_training_subject_level_eset.rds'

# Load data
tbnks = readRDS(TBNKS.IN.PATH)

# Remove unwanted samples for the following reasons
# V316 & V318: Lymphocyte subpopulations reported in TBNKs do not sum to total lymphocytes
# V40: Lymphocyte percentages reported in TBNKs are not equal to absolute population counts divided by total lymphocytes
outliers = c('V40', 'V316','V318')
tbnks = tbnks[, ! tbnks$visit_id %in% outliers]

# Convert all features expressed in percent lymphocytes
# to percent of WBCs (and also rename them with 'percent')
lymphocyte.subsets = c('cd3', 'cd4_cd3', 'cd8_cd3', 'cd19', 'nk_cells')
lymphocytes.percent = exprs(tbnks)['lymphocytes_percent', ]
for(feature in lymphocyte.subsets) {
  exprs(tbnks)[feature, ] = exprs(tbnks)[feature, ] * lymphocytes.percent / 100
  rownames(tbnks)[rownames(tbnks) == feature] = paste0(feature, '_percent')
}

# Rename _count to _abs for lymphocytes
rownames(tbnks) = gsub('_count', '_abs', rownames(tbnks))

# Choose the features we wish to include (note platelets are included as a 'population')
features = rownames(tbnks)
marrow.features = c('wbc', 'rbc', 'hemoglobin', 'mcv', 'mch', 'mchc', 'rdw')
absolute.population.features = grep('_abs', features, value = T)
relative.population.features = grep('_percent', features, value = T)
features.to.include = c(marrow.features, absolute.population.features, relative.population.features)

# Manually remove some additional features for the following reasons
# immature_granulocytes_abs: many samples were NA
# immature_granulocytes_percent: many samples were NA
# mpv: many samples were NA
# nucleated_rbc_abs: too few counts to be reliable
# nucleated_rbc_percent: too few counts to be reliable 
features.to.exclude = c('immature_granulocytes_abs',
                        'immature_granulocytes_percent',
                        'nucleated_rbc_abs',
                        'nucleated_rbc_percent',
                        'mpv')
features.to.include = setdiff(features.to.include, features.to.exclude)

# Subset to the desired features
tbnks = tbnks[features.to.include,]

# Only use samples in which all desired features have complete data
tbnks = tbnks[, complete.cases(t(exprs(tbnks)))]

# Add in the neutrophil to lymphocyte ratio to the features and feature data
df = as.data.frame(t(exprs(tbnks)))
df$'NLR' = df[['neutrophil_abs']] / df[['lymphocytes_abs']] * 100
f.data = fData(tbnks)
f.data['NLR',] = '%'

# Convert the data frame back to a matrix and transpose
X = t(as.matrix(df))

# Wrap everything up into an expression set
tbnks.samples = ExpressionSet(X)
phenoData(tbnks.samples) = phenoData(tbnks)
featureData(tbnks.samples) = AnnotatedDataFrame(f.data)

# Average over biological repeats
tbnks.subjects = averageRepeatSamples(tbnks.samples)

# Save esets
saveRDS(tbnks.samples, TBNKS.SAMPLE.LEVEL.OUT.PATH)
saveRDS(tbnks.subjects, TBNKS.SUBJECT.LEVEL.OUT.PATH)

R Biobase From line 12 of preprocessing/ready_tbnks_training.R

library(ggplot2)
library(limma)
library(doParallel)
library(variancePartition)
library(Biobase)

# Source utilities
source('scripts/util/VariancePartition/variancePartition.R')

# Set Globals
## Path to the medication inforamtion on patients
MEDICATIONS.IN.PATH = snakemake@input[[1]]#'Medications/medications.types.rds'

## Path to the sample-level expression sets for each dataset
ESETS.IN.PATHS = list(
  somalogic.features = snakemake@input[[2]],#'Data/Somalogic/data_analysis_ready/analysis_ready_sample_level_training_somalogic.rds',
  somalogic.modules = snakemake@input[[3]],#'Data/Somalogic/analysis_output/wgcna_results/scores_sample_level.rds',
  microarray.features = snakemake@input[[4]],#'Data/Microarray/data_analysis_ready/eset_batch_training_sample.rds',
  microarray.modules = snakemake@input[[5]],#'Data/Microarray/analysis_output/WGCNA/array_sample_scores.rds',
  tbnks = snakemake@input[[6]]#'Data/TBNK/data_analysis_ready/tbnk_training_sample_level_eset.rds'
)

## Paths to save the variance partition results
VP.OUT.PATHS = list(
  somalogic.features = snakemake@output[[1]],#'Data/Somalogic/analysis_output/variance_decomposition/somalogic_features_vp.RDS',
  somalogic.modules = snakemake@output[[2]],#'Data/Somalogic/analysis_output/variance_decomposition/somalogic_modules_vp.RDS',
  microarray.features = snakemake@output[[3]],#'Data/Microarray/analysis_output/variance_decomposition/microarray_features_vp.RDS',
  microarray.modules = snakemake@output[[4]],#'Data/Microarray/analysis_output/variance_decomposition/microarray_modules_vp.RDS',
  tbnks = snakemake@output[[5]]#'Data/TBNK/analysis_output/variance_decomposition/tbnk_features_vp.RDS'
)

# Load data
medications = readRDS(MEDICATIONS.IN.PATH)
esets = lapply(ESETS.IN.PATHS, readRDS)

# Remove unwanted medications and patient info columns from the medications matrix
medications.to.remove = c('Procedure', 'Surgery', 'Transfusion', 'Transplantation', 'Other')
columns.to.remove = c('patient_id','visit_id', medications.to.remove)
medications = medications[, setdiff(colnames(medications), columns.to.remove)]

# Get a matrix of medications that corresponds to the visits in each data set
medication.matrices = lapply(esets, function(eset) {
  medications[colnames(eset), ]
})

# Run variance partitions
vps = mapply(function(eset, medication.matrix) {
  condition_medication_variance_partition(eset, medication.matrix)
}, esets, medication.matrices, SIMPLIFY = FALSE)

# Save results
mapply(function(vp, out.path) {saveRDS(vp, out.path)}, vps, VP.OUT.PATHS)

R ggplot2 Biobase doParallel variancePartition From line 5 of analysis/variance_partitions.R

script:
    "scripts/Microarray/0_rma/rma.R"

SnakeMake From line 109 of main/Snakefile

script: 
    "scripts/Microarray/0_rma/add_pdata.R"

SnakeMake From line 118 of main/Snakefile

script:
    "scripts/Microarray/2_probeset/get_anno_pick_probes.R" 

SnakeMake From line 128 of main/Snakefile

shell:
    "Rscript -e \"rmarkdown::render(\'scripts/Microarray/3_filtering/pre_filtering.Rmd\', "
    "params = list(eset=\'{input.eset}\', picked_probes=\'{input.picked_probes}\', probe_anno=\'{input.probe_anno}\',"
    "training_sample=\'{output.training_sample}\', training_subject=\'{output.training_subject}\',"
    "qc=\'{output.qc}\', val_sample=\'{output.val_sample}\', val_subject=\'{output.val_subject}\',"
    "batch_training_sample=\'{output.batch_training_sample}\', batch_training_subject=\'{output.batch_training_subject}\',"
    "batch_qc=\'{output.batch_qc}\', batch_val_sample=\'{output.batch_val_sample}\', batch_val_subject=\'{output.batch_val_subject}\'"
    "))\""

SnakeMake From line 150 of main/Snakefile

    script:
        "scripts/Microarray/4_WGCNA/wgcna.R" #This requires a good bit of memory, so if you run into problems try upping the memory in the cluster_config

#jive
rule jive_subject_all:
    input:
        array_in = "Pipeline_out/Data/Microarray/analysis_output/stability/stable_microarray_subject_level_features.rds",
        soma_in = "Pipeline_out/Data/Somalogic/analysis_output/stability/stable_somalogic_subject_level_features.rds"

SnakeMake From line 173 of main/Snakefile

script:
    "scripts/jive/jive_run/jive_subject_level_run.R"

SnakeMake From line 183 of main/Snakefile

script:
    "scripts/jive/jive_run/jive_subject_level_run_onlyHealthy.R"

SnakeMake From line 192 of main/Snakefile

script:
    "scripts/jive/jive_run/jive_subject_level_run_noHealthy.R"

SnakeMake From line 201 of main/Snakefile

script:
    "scripts/jive/pca_run/jive_pca_run.R"

SnakeMake From line 209 of main/Snakefile

script:
    "scripts/jive/enrich/jive_pca_enrich_wdirection.R"

SnakeMake From line 222 of main/Snakefile

script:
    "scripts/Classification/preprocessing/create_classifier_groups.R"

SnakeMake From line 236 of main/Snakefile

script:
    "scripts/Classification/preprocessing/create_design_matrices.R"

SnakeMake From line 253 of main/Snakefile

script:
    "scripts/Classification/preprocessing/subset_design_matrices.R"

SnakeMake From line 262 of main/Snakefile

script:
     "scripts/Classification/preprocessing/design_mat_no_pm2.R"

SnakeMake From line 270 of main/Snakefile

script:
    "scripts/Classification/analysis/run_random_forests.R"

SnakeMake From line 283 of main/Snakefile

script:
    "scripts/Classification/analysis/run_random_forests.R"

SnakeMake From line 296 of main/Snakefile

script:
    "scripts/Classification/analysis/run_gvi_permutations.R"

SnakeMake From line 309 of main/Snakefile

script:
    "scripts/Classification/analysis/get_feature_pvalues.R"

SnakeMake From line 318 of main/Snakefile

script:
    "scripts/Stability/analysis/select_stable_features.R"

SnakeMake From line 350 of main/Snakefile

script:
    "scripts/Classification/preprocessing/create_design_matrices.R"

SnakeMake From line 366 of main/Snakefile

script:
    "scripts/Classification/preprocessing/subset_design_matrices.R"

SnakeMake From line 375 of main/Snakefile

script:
    "scripts/Classification/analysis/get_testing_set_HI.R"

SnakeMake From line 384 of main/Snakefile

script:
    "scripts/Somalogic/preprocessing/preprocess_somalogic.R"

SnakeMake From line 397 of main/Snakefile

script:
    "scripts/Somalogic/preprocessing/ready_somalogic_training.R"

SnakeMake From line 407 of main/Snakefile

script:
    "scripts/Somalogic/preprocessing/ready_somalogic_testing.R"

SnakeMake From line 417 of main/Snakefile

script:
    "scripts/Somalogic/analysis/run_somalogic_wgcna.R"

SnakeMake From line 432 of main/Snakefile

script:
    "scripts/TBNK/preprocessing/create_tbnk_eset.R"

SnakeMake From line 442 of main/Snakefile

script:
    "scripts/TBNK/preprocessing/ready_tbnks_training.R"

SnakeMake From line 452 of main/Snakefile

script:
    "scripts/TBNK/preprocessing/ready_tbnks_testing.R"

SnakeMake From line 462 of main/Snakefile

script:
    "scripts/Medications/preprocessing/clean_medications.R"

SnakeMake From line 472 of main/Snakefile

script:
    "scripts/Stability/analysis/run_variance_partitions.R"

SnakeMake From line 489 of main/Snakefile

script:
    "scripts/Stability/analysis/select_stable_features.R"

SnakeMake From line 521 of main/Snakefile

script:
    "scripts/Signatures/analysis/make_proteomic_surrogate_signature.R"

SnakeMake From line 533 of main/Snakefile

script:
    "scripts/Signatures/analysis/remove_unwanted_somamers.R"

SnakeMake From line 547 of main/Snakefile

script:
    "scripts/Signatures/analysis/run_microarray_classifier.R"

SnakeMake From line 557 of main/Snakefile

script:
    "scripts/Signatures/analysis/make_transcriptional_surrogate_signatures.R"

SnakeMake From line 572 of main/Snakefile

script:
    "scripts/DifferentialExpression/analysis/get_DE_fits.R"

SnakeMake From line 589 of main/Snakefile

script:
    "scripts/DifferentialExpression/feature_geneset_enrichments/run_enrich_camera.R"

SnakeMake From line 598 of main/Snakefile

script:
    "scripts/DifferentialExpression/analysis/get_DE_stats.R"

SnakeMake From line 625 of main/Snakefile

script:
    "scripts/DifferentialExpression/analysis/get_DE_fits_without_gamma_subjects.R"

SnakeMake From line 643 of main/Snakefile

script:
    "scripts/DifferentialExpression/analysis/get_DE_stats.R"

SnakeMake From line 670 of main/Snakefile

script:
    "scripts/DifferentialExpression/analysis/get_sex_based_DE_fits.R"

SnakeMake From line 686 of main/Snakefile

script:
    "scripts/DifferentialExpression/analysis/get_sex_based_DE_stats.R"

SnakeMake From line 713 of main/Snakefile

script:
    "scripts/BaltimoreCohortValidation/preprocessing/process_ferrucci_data.R"

SnakeMake From line 724 of main/Snakefile

script:
    "scripts/MetaAnalysis/preprocessing/get_series.R"

SnakeMake From line 733 of main/Snakefile

script:
    "scripts/MetaAnalysis/preprocessing/get_cgp_info.R"

SnakeMake From line 745 of main/Snakefile

script:
    "scripts/MetaAnalysis/preprocessing/get_jamboree_data.R"

SnakeMake From line 758 of main/Snakefile

script:
    "scripts/MetaAnalysis/preprocessing/clean_cgps.R"

SnakeMake From line 768 of main/Snakefile

script:
    "scripts/MetaAnalysis/preprocessing/clean_jamboree_data.R"

SnakeMake From line 778 of main/Snakefile

script:
    "scripts/MetaAnalysis/preprocessing/get_jamboree_scores.R"

SnakeMake From line 790 of main/Snakefile

script:
    "scripts/MetaAnalysis/analysis/results_scores.R"

SnakeMake From line 799 of main/Snakefile

script:
    "scripts/MetaAnalysis/analysis/results_enrichments.R"

SnakeMake From line 810 of main/Snakefile

script:
    "scripts/Enrichments/preprocessing/make_gene_sets.R"

SnakeMake From line 826 of main/Snakefile

script:
    "scripts/Enrichments/preprocessing/process_protein_atlas.R"

SnakeMake From line 835 of main/Snakefile

script:
    "scripts/Enrichments/analysis/microarray_modules_gene_set_enrichments.R"

SnakeMake From line 845 of main/Snakefile

script:
    "scripts/Enrichments/analysis/somalogic_modules_gene_set_enrichments.R"

SnakeMake From line 858 of main/Snakefile

script:
    "scripts/Enrichments/analysis/transcriptional_signature_enrichments.R"

SnakeMake From line 869 of main/Snakefile

script:
    "scripts/VarianceDecomposition/analysis/variance_partitions.R"

SnakeMake From line 887 of main/Snakefile

script:
    "scripts/Classification/analysis/get_PID_based_HI_on_AI_subjects.R"

SnakeMake From line 898 of main/Snakefile

script:
    "scripts/Classification/analysis/get_AI_based_HI_on_PID_subjects.R"

SnakeMake From line 909 of main/Snakefile

script:
    "scripts/Somalogic/analysis/get_testing_set_somalogic_module_scores.R"

SnakeMake From line 921 of main/Snakefile

script:
    "scripts/Microarray/4_WGCNA/get_testing_set_microarray_module_scores.R"

SnakeMake From line 933 of main/Snakefile

script:
    "scripts/Paper_Figures/Figure_1/figure_1_version_6.R"

SnakeMake From line 944 of main/Snakefile

script:
    "scripts/Paper_Figures/Figure_1/figure1_module_tbnk_vp.R"

SnakeMake From line 954 of main/Snakefile

script:
    "scripts/Paper_Figures/Figure_1/figure1_condition_counts.R"

SnakeMake From line 963 of main/Snakefile

script:
    "scripts/Paper_Figures/Figure_1/figure_1_addendum_version_2.R"

SnakeMake From line 977 of main/Snakefile

script:
    "scripts/Paper_Figures/Figure_2/figure_2_version_2.R"

SnakeMake From line 999 of main/Snakefile

script:
    "scripts/Paper_Figures/Figure_2/figure_2a_version_4.R"

SnakeMake From line 1012 of main/Snakefile

script:
    "scripts/Paper_Figures/Figure_2/figure_2_n_subjects.R"

SnakeMake From line 1024 of main/Snakefile

script:
    "scripts/Paper_Figures/Figure_2/module_and_tbnk_boxplots_highlight.R"

SnakeMake From line 1040 of main/Snakefile

script:
    "scripts/Paper_Figures/Figure_2/IL23_box_and_corr_w_ifn_gamma.R"

SnakeMake From line 1050 of main/Snakefile

script:
    "scripts/Paper_Figures/Figure_2/IL23_corr_w_tbnk.R"

SnakeMake From line 1059 of main/Snakefile

script:
    "scripts/Paper_Figures/Figure_3/figure3_schematic.R"

SnakeMake From line 1065 of main/Snakefile

script:
    "scripts/Paper_Figures/Figure_3/3a_PCA_with_boxplots.R"

SnakeMake From line 1076 of main/Snakefile

script:
    "scripts/Paper_Figures/Figure_3/jpc1_healthy_vs_disease.R"

SnakeMake From line 1086 of main/Snakefile

script:
    "scripts/Paper_Figures/Figure_3/jive_pc_comparison.R"

SnakeMake From line 1096 of main/Snakefile

script:
    "scripts/Paper_Figures/Figure_3/joint_tbnk_module_cor.R"

SnakeMake From line 1108 of main/Snakefile

script:
    "scripts/Paper_Figures/Figure_3/3b_jive_var_expl.R"

SnakeMake From line 1117 of main/Snakefile

script:
    "scripts/Paper_Figures/Figure_4/figure_4_version_6.R"

SnakeMake From line 1134 of main/Snakefile

script:
    "scripts/Paper_Figures/Figure_5/figure_5_version_3.R"

SnakeMake From line 1148 of main/Snakefile

script:
    "scripts/Paper_Figures/Figure_5/il6_ihm_cor_ferrucci_data.R"

SnakeMake From line 1157 of main/Snakefile

script:
    "scripts/Paper_Figures/Figure_5/il6_ihm_cor.R"

SnakeMake From line 1168 of main/Snakefile

script:
    "scripts/Paper_Figures/Figure_5/cxcl9_ihm_cor.R"

SnakeMake From line 1178 of main/Snakefile

script:
    "scripts/Paper_Figures/Supplemental_Figure_1/supplemental_figure_1_version_3.R"

SnakeMake From line 1199 of main/Snakefile

script:
    "scripts/Paper_Figures/Supplemental_Figure_1/tbnk_module_cor.R"

SnakeMake From line 1209 of main/Snakefile

script:
    "scripts/Paper_Figures/Supplemental_Figure_2/tbnk_heatmap.R"

SnakeMake From line 1218 of main/Snakefile

script:
    "scripts/Paper_Figures/Supplemental_Figure_2/supplemental_figure_2_other_classifiers.R"

SnakeMake From line 1247 of main/Snakefile

script:
    "scripts/Paper_Figures/Supplemental_Figure_3/array_individual_scatter_and_boxplots.R"

SnakeMake From line 1257 of main/Snakefile

script:
    "scripts/Paper_Figures/Supplemental_Figure_3/soma_individual_scatter_and_boxplots.R"

SnakeMake From line 1267 of main/Snakefile

script:
    "scripts/Paper_Figures/Supplemental_Figure_3/cor_PC2_leuko_composite.R"

SnakeMake From line 1279 of main/Snakefile

script:
    "scripts/Paper_Figures/Supplemental_Figure_3/jive_pca_enrich.R"

SnakeMake From line 1287 of main/Snakefile

script:
    "scripts/Paper_Figures/Supplemental_Figure_3/cgd_jpc1.R"

SnakeMake From line 1296 of main/Snakefile

script:
    "scripts/Paper_Figures/Supplemental_Figure_4/supplemental_figure_4_version_5.R"

SnakeMake From line 1327 of main/Snakefile

script:
    "scripts/Paper_Figures/Supplemental_Figure_4/supplemental_figure_4_addendum_version_1.R"

SnakeMake From line 1346 of main/Snakefile

script:
    "scripts/Paper_Figures/Supplemental_Figure_5/supplemental_figure_5_version_4.R"

SnakeMake From line 1362 of main/Snakefile

script:
    "scripts/Paper_Figures/Supplemental_Figure_5/no_pm2_model_age_cor.R"

SnakeMake From line 1371 of main/Snakefile

script:
    "scripts/Paper_Figures/Figure_1_Statistics/figure_1_stats_version_1.Rmd"

SnakeMake From line 1385 of main/Snakefile

script:
    "scripts/Paper_Figures/Figure_2_Statistics/supplemental_figure_2_other_classifiers_table.Rmd"

SnakeMake From line 1400 of main/Snakefile

script:
    "scripts/Paper_Figures/Figure_2_Statistics/cgd_stat1_ifn.R"

SnakeMake From line 1410 of main/Snakefile

script:
    "scripts/Paper_Figures/Figure_4_Statistics/figure_4_stats_version_1.Rmd"

SnakeMake From line 1419 of main/Snakefile

script:
    "scripts/Paper_Figures/Figure_4_Statistics/n_features_used_classifiers.Rmd"

SnakeMake From line 1427 of main/Snakefile

script:
    "scripts/Paper_Figures/Figure_5_Statistics/figure_5_stats_version_1.Rmd"

SnakeMake From line 1437 of main/Snakefile

script:
    "scripts/Paper_Figures/Figure_1_Tables/figure_1_tables_version_1.R"

SnakeMake From line 1451 of main/Snakefile

script:
    "scripts/Paper_Figures/Figure_1_Tables/figure_1_tables_module_members.R"

SnakeMake From line 1467 of main/Snakefile

script:
    "scripts/Paper_Figures/Figure_2_Tables/figure_2_tables_version_2.R"

SnakeMake From line 1486 of main/Snakefile

script:
    "scripts/Paper_Figures/Figure_3_Tables/jpc_enrichments.R"

SnakeMake From line 1495 of main/Snakefile

script:
    "scripts/Paper_Figures/Figure_3_Tables/jpc_scores.R"

SnakeMake From line 1504 of main/Snakefile

script:
    "scripts/Paper_Figures/Figure_3_Tables/jive_pc_feat_cor.R"

SnakeMake From line 1514 of main/Snakefile

script:
    "scripts/Paper_Figures/Figure_4_Tables/figure_4_tables_version_2.R"

SnakeMake From line 1524 of main/Snakefile

script:
    "scripts/Paper_Figures/Figure_4_Tables/save_hi.R"

SnakeMake From line 1533 of main/Snakefile

script:
    "scripts/Paper_Figures/Figure_4_Tables/save_meta_analysis_results.R"

SnakeMake From line 1542 of main/Snakefile

script:
    "scripts/Paper_Figures/Figure_4_Tables/save_surrogate_signature_genes.R"

SnakeMake From line 1550 of main/Snakefile

script:
    "scripts/Paper_Figures/Figure_4_Tables/meta_analysis_n_subj_per_study.R"

SnakeMake From line 1559 of main/Snakefile

script:
    "scripts/Paper_Figures/Figure_5_Tables/figure_5_tables_version_1.R"

SnakeMake From line 1567 of main/Snakefile

script:
    "scripts/Paper_Figures/Figure_5_Tables/proteomic_hi_surrogate.R"

SnakeMake From line 1576 of main/Snakefile

script:
    "scripts/Paper_Figures/Figure_5_Tables/cxcl9_regress_age_ihm.R"

SnakeMake From line 1588 of main/Snakefile

script:
    "scripts/Paper_Figures/compile_all_supp_tables_excel.R"

SnakeMake From line 1626 of main/Snakefile

Multiomics Integration of Immune-Mediated Monogenic Diseases

Help improve this workflow!

monogenic-immune-health

Multiomics integration of 22 immune-mediated monogenic diseases reveals an emergent axis of immune health in humans

Input Data

Instructions

Terms of Use

Code Snippets

Comments

Support

Free

Related Workflows

public

public

public

public

public

public