Workflow Steps and Code Snippets

183 tagged steps and code snippets that match the keyword stringr

Processing sorted cells from BC1 that have been profiled by bulk RNA-seq

log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library(tximport)
library(DESeq2)
library(readr)
library(stringr)

res_dirs <- snakemake@input[['cts']]
tx2g_fp <- snakemake@input[['tx2g']]
samples_fp <- snakemake@input[['samples']]
quant_program <- snakemake@params[['aligner']]

design_formula <- snakemake@params[['formula']]

if (snakemake@threads > 1) {
    library("BiocParallel")
    parallel <- TRUE
    register(MulticoreParam(snakemake@threads))
} else {
    parallel <- FALSE
}

# kallisto quantifications are read from per-sample abundance.h5 files;
# otherwise assume salmon-style quant.sf files
if (quant_program == 'kallisto') {
    files <- file.path(res_dirs, "abundance.h5")
} else {
    files <- file.path(res_dirs, "quant.sf")
}
names(files) <- basename(dirname(files))
tx2g <- read_table2(tx2g_fp, col_names = c("tx", "ensgene", "symbol"))
txi <- tximport(files, type = quant_program, txOut = FALSE, tx2gene = tx2g[, 1:2])

samples <- read.csv(samples_fp)
print("First")
print(samples)
samples$id <- paste(samples$patient, "-", samples$condition, sep = "")
print("Second")
print(samples)
# Reorder rows so they match files order
samples <- samples[match(names(files), samples$id),]
print("Third")
print(files)
print(samples)

## Ensure factor ordering based on config specifications
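## The 'levels' param is assumed to look like "condition=control,treated;batch=b1,b2"
## (illustrative values): entries separated by ';', each mapping a column name to its
## comma-separated level order.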
vars <- snakemake@params[['levels']]
var_levels <- str_split(vars, ';', simplify = TRUE)
for (var in var_levels) {
    print(paste("Variable:", var))
    s <- str_split(var, '=|,', simplify = TRUE)
    col <- s[1, 1]
    level_order <- s[1, 2:ncol(s)]
    samples[, col] <- factor(samples[, col], levels = level_order)
}

f <- as.formula(design_formula)
ddsTxi <- DESeqDataSetFromTximport(txi, colData = samples, design = f)

# Drop genes with zero counts across all samples
keep <- rowSums(counts(ddsTxi)) >= 1
ddsTxi <- ddsTxi[keep, ]

dds <- DESeq(ddsTxi, parallel=parallel)
print(dds)

# Variance-stabilizing transformation; blind = FALSE uses the design when estimating the dispersion trend
vst_cts <- vst(dds, blind = FALSE)
print(vst_cts)

saveRDS(dds, file=snakemake@output[['deseq']])
saveRDS(vst_cts, file=snakemake@output[['cts']])

log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library(DESeq2)
library(IHW)
library(ggplot2)
library(dplyr)
library(stringr)

if (snakemake@threads > 1) {
    library("BiocParallel")
    parallel <- TRUE
    register(MulticoreParam(snakemake@threads))
} else {
    parallel <- FALSE
}

t2g_fp <- snakemake@input[[2]]
t2g <- readr::read_table2(t2g_fp, col_names = c("tx", "ensgene", "symbol"))
t2g <- t2g %>%
    dplyr::distinct(ensgene, symbol)

dds <- readRDS(snakemake@input[[1]])

print(dds)

# Grab last variable at end of formula for contrasts
design_formula <- snakemake@params[['formula']]
s <- str_remove_all(design_formula, " |~")
s <- str_split(s, "\\+")
vars <- s[[1]]
var <- vars[length(vars)]

print(snakemake@params[['contrast']])
print("Creating results for the following variable:")
print(var)

print(resultsNames(dds))

contrast_coef <- paste(c(var, snakemake@params[['contrast']][1], "vs", snakemake@params[['contrast']][2]), collapse="_")
de_contrast <- c(var, snakemake@params[['contrast']][1], snakemake@params[['contrast']][2])
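# With a design like ~ batch + condition and contrast c("treated", "control")
# (illustrative values), this builds the coefficient name
# "condition_treated_vs_control", matching resultsNames(dds).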

# MLE log2 fold changes, with independent hypothesis weighting (IHW) for multiple testing
mle_res <- results(dds, contrast=de_contrast, filterFun=ihw, alpha = .05, parallel = parallel)
# MAP (shrunken) log2 fold changes via apeglm; lfcShrink takes a coefficient name rather than a contrast
map_res <- lfcShrink(dds, coef=contrast_coef, type = "apeglm", parallel = parallel)

print("MLE LFC:")
print(mle_res)
print(summary(mle_res))

print("MAP LFC:")
print(map_res)
print(summary(map_res))

mle_df <- mle_res %>%
  data.frame() %>%
  tibble::rownames_to_column(var = "ensgene") %>%
  as_tibble() %>%
  left_join(t2g) %>%
  dplyr::select(symbol, ensgene, everything()) %>%
  dplyr::arrange(padj) %>%
  dplyr::distinct(symbol, .keep_all = TRUE)

map_df <- map_res %>%
    data.frame() %>%
    tibble::rownames_to_column(var = "ensgene") %>%
    as_tibble() %>%
    left_join(t2g) %>%
    dplyr::select(symbol, ensgene, everything()) %>%
    dplyr::arrange(padj) %>%
    dplyr::distinct(symbol, .keep_all = TRUE)

print("MLE dataframe")
print(mle_df)
print("MAP dataframe")
print(map_df)


mleplot <- mle_df %>%
    dplyr::mutate(
        significant = ifelse(padj < .05, "padj < .05", "padj >= .05"),
        direction = ifelse(log2FoldChange > 0, "Upregulated", "Downregulated")
    ) %>%
    ggplot(aes(log10(baseMean), log2FoldChange)) +
    geom_point(aes(color = significant, shape = direction)) +
    geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
    labs(x = "log10 Expression", y = "MLE Log2FoldChange") +
    theme_bw()

mapplot <- map_df %>%
    dplyr::mutate(
        significant = ifelse(padj < .05, "padj < .05", "padj >= .05"),
        direction = ifelse(log2FoldChange > 0, "Upregulated", "Downregulated")
    ) %>%
    ggplot(aes(log10(baseMean), log2FoldChange)) +
    geom_point(aes(color = significant, shape = direction)) +
    geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
    labs(x = "log10 Expression", y = "MAP Log2FoldChange") +
    theme_bw()

readr::write_csv(mle_df, snakemake@output[['mleres']])
readr::write_csv(map_df, snakemake@output[['mapres']])
ggsave(snakemake@output[['mlema']], plot = mleplot, width = 10, height = 7)
ggsave(snakemake@output[['mapma']], plot = mapplot, width = 10, height = 7)

log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library(DESeq2)
library(ggplot2)
library(stringr)

vsd <- readRDS(snakemake@input[[1]])

print(vsd)

label_vars <- str_split(snakemake@params[['label_vars']], ',', simplify=T)[1, ]

pcaplot <- plotPCA(vsd, intgroup=label_vars)

ggsave(snakemake@output[[1]], pcaplot)

BIOF501 Term Project: Pipeline for Pseudobulking and Function Prediction

print("RUNNING EGAD")



packages <- installed.packages()[, "Package"]

if (!"EGAD" %in% packages) {
    BiocManager::install("EGAD")
}

library(EGAD)
library(tidyverse)
library(stringr)
#data_file <- snakemake@input[[1]]

# Change these per run; data_file must match is_bulk below
save <- "~/Masters/Pseudobulk_Function_Pipeline_HighRes/data/EAGD/EGAD_sum_pc_OPfiltered.csv"
# data_file <- "~/Masters/Pseudobulk_Function_Pipeline_HighRes/data/pseudobulk/sum_pseudobulk.csv"
data_file <- "~/Masters/Pseudobulk_Function_Pipeline_HighRes/data/bulk/bulk_pc.csv"

is_bulk <- TRUE

# Never change
pc_genes <- "~/Masters/Pseudobulk_Function_Pipeline_HighRes/data/pc_genes/processed_uniprot.csv"
sample_names <- "~/Masters/Pseudobulk_Function_Pipeline_HighRes/data/sample_names/bulk_pseudo.csv"
shared_genes <- "~/Masters/Pseudobulk_Function_Pipeline_HighRes/data/pc_genes/scAndBulkOverlapGenes.txt"

#../../../../pipeline42/datasets/Gtex/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct



################ 2.1 : MAKING DATA SETS

########### MAKE COEXPRESSION NETWORK

#~~~~~~~~~ If using file
#expression_data <- t(read.delim('data/LiverAverageCounts.csv', sep = ',', header= TRUE, row.names = 1))


#~~~~~~~~ Diagnostic WD

#getwd()

#~~~~~~~~~If using command line

#args = commandArgs(trailingOnly=TRUE)
#data_file <- args[1]


print("Loading Expression Dataset")

expression_data <- (read.delim(data_file, sep = ',', header= TRUE, row.names = 1))
sample_names_df <- read.delim(sample_names, sep = ",", row.names = 1, header = TRUE)




### Remove Organism Parts that are NOT shared between the Tabula and GTEx datasets
filter_by_OP <- function(OP, expression_data) {
  expression2 <- expression_data %>%
    filter(str_detect(rownames(expression_data), paste0(OP, ".*")))
  return(expression2)
}
if (is_bulk) {
  print('Subsampling for Bulk name Organism Parts')
  OP_names <- sample_names_df$bulk_names
  expression_data <- expression_data %>%
    filter(rownames(expression_data) %in% OP_names)
} else {
  print('Subsampling for Pseudobulk name Organism Parts')
  OP_names <- sample_names_df$pseudo_names
  expression_data_2 <- lapply(OP_names, filter_by_OP, expression_data)
  expression_data <- do.call(rbind, expression_data_2)
}


print("Filtering Expression Data for PC genes")
pc <- read.delim(pc_genes, sep = ",", header = TRUE, row.names = 1)
pc_names <- pc$FirstUniprot
pc_expression <- expression_data[,colnames(expression_data) %in% pc_names]
print(paste("Removed",ncol(expression_data)- ncol(pc_expression) , "non-protein coding genes"))




######### Build Coexpression Network
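# cor() on the samples-by-genes matrix yields a gene-by-gene Pearson correlation
# matrix; undefined correlations (e.g. from zero-variance genes) are zeroed below.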

coexpression_network <- cor(pc_expression)
coexpression_network[is.na(coexpression_network)] <- 0


############ BUILDING ANNOTATION SET
print("Building Annotation Set")

### With builtin GO
#annotations <- make_annotations(GO.human[,c('GO', 'evidence')], unique(GO.human$GO), unique(GO.human$evidence))

### With Custom GO with BP
GO <- read.delim(file = '~/Masters/Pseudobulk_Function_Pipeline_HighRes/data/GO/pro_GO.csv', sep = ",", stringsAsFactors = TRUE)

# Filter the GO-to-gene pairings to genes measured in our expression data, because we only want GO terms with >= 20 genes
expression_genes <- colnames(expression_data)
GO <- filter(GO, GO$DB_Object_Symbol %in% expression_genes)
# In our sc data this removes ~3,000 genes; in bulk it removes ~16,000 genes

# Filter for only the genes that are measured in both data types (note: this is now redundant with the previous step)
sharedGenes <- read_csv(file = shared_genes, col_names = FALSE)
GO <- filter(GO, GO$DB_Object_Symbol %in% sharedGenes$X1)

GO_unique <- data.frame(table(GO$GO.ID))
colnames(GO_unique) <- c('GO', 'count')


# Histogram of how many genes are associated with each GO term
ggplot(data = GO_unique) +
  geom_histogram(mapping = aes(count)) +
  scale_y_continuous(trans = 'log10') +
  labs(title = 'Distribution of Genes in GO Terms') +
  xlab('Number of Genes') +
  ylab('Number of GO Terms')


################ Remove GO terms with fewer than 20 genes in the expression data

GO_unique_filtered <- filter(GO_unique, count >= 20)

1 - nrow(GO_unique_filtered)/nrow(GO_unique) # 92.8% of GO terms were removed in sc; 92% in bulk

ggplot(data = GO_unique_filtered) +
  geom_histogram(mapping = aes(count), breaks = c(0, 19, 30, 40, 50, 60, 70, 80, 90, 100, 150)) +
  scale_y_continuous(trans = 'log10') +
  labs(title = 'Distribution of Genes Associated with GO Terms') +
  xlab('Number of Genes') +
  ylab('Count of GO Terms')

# With GO_unique_filtered, we now have all of the GO terms we want to use in our analysis

GO_20_or_more <- dplyr::filter(GO, (GO$GO.ID %in% GO_unique_filtered$GO))

# 50694 GO-to-gene associations were filtered out
1 - nrow(GO_20_or_more)/nrow(GO) # 44.9% of gene-to-GO-term associations were for GO terms with fewer than 20 genes; 45% in bulk

# Note: although we removed 86.2% of GO terms, this only removed 31.5% of GO-to-gene associations


# Make one-hot encoding matrix
# Contains only GO terms with 20 genes or more that were measured in both datasets
annotations <- make_annotations(GO_20_or_more[, c('DB_Object_Symbol', 'GO.ID')], unique(GO_20_or_more$DB_Object_Symbol), unique(GO_20_or_more$GO.ID))

################ Neighbor Voting
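# neighbor_voting scores each GO term by guilt-by-association: held-out genes are
# ranked by their summed co-expression to annotated neighbours, and performance is
# summarized as a 3-fold cross-validated AUROC per term.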
print("Performing Neighbor Voting. This can take a while")
auroc <- neighbor_voting(genes.labels = annotations,
                         network = coexpression_network,
                         nFold = 3,
                         output = "AUROC")


#auroc <- run_GBA(network = coexpression_network,
#                 labels = annotations)

rm(coexpression_network)
rm(annotations)

write.table(x = auroc, file = save, sep = ",")
print(paste0("Wrote AUROC to ", save))

RADSeq tool with Snakemake workflow integration for analysis of RAD sequencing data. (latest)

library("tidyverse")
library("stringr")

log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

blast <- snakemake@input[[1]]
blast_data <- read_tsv(blast, col_names = TRUE, trim_ws = TRUE)
names(blast_data) <- c('res_id','sim_id','identity', 'len_alignment', 'mismatches', 'gap_opens', 'q_start', 'q_end', 's_start', 's_end', 'evalue', 'bit_score')
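# These 12 columns are the standard BLAST tabular (outfmt 6) fields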

blast_data <- blast_data %>%
  mutate(res_id = as.character(res_id)) %>%
  separate(res_id, c("sample", "individual", "locus"), sep = "\\|", extra = "merge", convert = TRUE)

# Zero-pad locus IDs whose prefix is shorter than five characters (e.g. "LOC1" -> "LOC01")
# so that loci sort consistently in the plots
for(i in blast_data$locus) {
  if(nchar(strsplit(i, "-")[[1]][1]) < 5){
    blast_data$locus[blast_data$locus == i] <- str_replace(blast_data$locus[blast_data$locus == i], 'LOC', 'LOC0')
  }
}

plot <- ggplot(data = blast_data, aes(y=locus, x=identity)) +
  geom_bar(width = 1.0, position = "dodge", stat="identity", aes(fill = identity), colour="Black") +
  scale_fill_gradient(low="mediumpurple3",high="green3", name = "Identity") +
  ggtitle("Indentity [%] of loci identified by NodeRAD vs. simulated loci") +
  xlab("Identity [%]") +
  ylab("Locus") +
  theme_minimal() +
  theme(aspect.ratio = 2.5/1.5, plot.title = element_text(hjust = 0.5), legend.position = "right", legend.key.size = unit(0.8, "cm"), axis.text.y = element_text(hjust = 0))
plot
ggsave(snakemake@output[["ident"]], width = 7, height = 7)

blast_data$intervals <- cut(blast_data$identity, seq(0,100,by=1))
plot <- ggplot(blast_data, aes(intervals)) +
    geom_histogram(stat = "count", aes(fill = ..count..)) +
    xlab("Identity [%]") +
    ylab("Counts") +
    scale_fill_gradient(name = "Counts")+
    theme(aspect.ratio = 2.5/1.5, plot.title = element_text(hjust = 0.5), legend.position = "right") +
    ggtitle("Histogram of number of alleles identified by NodeRAD\nwith respect to their similarity to simulated data.")
plot
ggsave(snakemake@output[["ident_hist"]], width = 7, height = 7)

plot <- ggplot(data = blast_data, aes(x=locus, y=bit_score, group = individual)) +
  geom_line(color = "gray70") +
  geom_point(aes(color = bit_score), size =3) +
  geom_point(shape = 1,size = 3, colour = "black") +
  scale_color_gradient(low="red", high="green", name = "Bit score") +
  ggtitle("Bitscores of loci identified by NodeRAD vs. simulated loci") +
  xlab("Locus") +
  ylab("Bit score") +
  theme_minimal() +
  theme(axis.text.x = element_text(color = "black", size = 7, angle = 90, hjust = 0, face = "plain"), plot.title = element_text(hjust = 0.5), legend.position = "right", legend.key.size = unit(0.4, "cm"), axis.text.y = element_text(hjust = 0))
plot
ggsave(snakemake@output[["bit_scores"]], width = 7, height = 7)

plot <- ggplot(data = blast_data, aes(x=locus, y=evalue, group = individual)) +
  geom_line(color = "gray70") +
  geom_point(aes(color = evalue), size =3) +
  geom_point(shape = 1,size = 3, colour = "black") +
  scale_color_gradient(low="green", high="red", name = "E-value") +
  ggtitle("E-Values of loci identified by NodeRAD vs. simulated loci") +
  xlab("Locus") +
  ylab("E-Value") +
  theme_minimal() +
  theme(axis.text.x = element_text(color = "black", size = 7, angle = 90, hjust = 0, face = "plain"), plot.title = element_text(hjust = 0.5), legend.position = "right", legend.key.size = unit(0.4, "cm"), axis.text.y = element_text(hjust = 0))
plot
ggsave(snakemake@output[["evalues"]], width = 7, height = 7)

Improving pathogenicity prediction of missense variants by using AlphaFold-derived features (0.9)

library(readr)
library(dplyr)
library(optparse)
library(stringr)
source("workflow/scripts/existing_scores_glm_functions.R")
set.seed(1)

option_list = list(
  make_option(c("-t", "--training_dataset"), type="character", default="results/prediction_final/pre_final_model_regular_variants.csv.gz", 
              help="File containing the training dataset"),
  make_option(c("-c", "--combined_model"), type="character", default="results/prediction_final/combined_model.RData", 
              help="File where the final combined logistic regression model should be saved to"),
  make_option(c("-i", "--training_var_ids"), type="character", default="results/prediction_final/training_var_ids.tsv", 
              help="IDs of variants that were in the training / test set")
)

opt = parse_args(OptionParser(option_list=option_list))

variants<-read_csv(opt$training_dataset, na=c(".","NA", NA))
variants$AlphScore<-variants$predicted_Alph

index_col<-get_index_col(variants)

variants$DEOGEN2_score_med<- unlist_score(variants$DEOGEN2_score, index_col)

gnomad_vars<-variants %>% 
  filter(gnomadSet==1)

clinvar_vars_not_gnomad <-variants%>% 
  filter(gnomadSet==0) %>%
  filter(! var_id_genomic %in% gnomad_vars$var_id_genomic )

set_of_models<-fit_set_of_models(clinvar_vars_not_gnomad)

gnomad_set_position<-gnomad_vars$var_id_genomic
clinvar_set_position<-clinvar_vars_not_gnomad$var_id_genomic

training_vars<-rbind(
tibble(var_id_genomic=gnomad_set_position,
       gnomadSet=1),
tibble(var_id_genomic=clinvar_set_position,
       gnomadSet=0)
)
# write to disk
write_tsv(x=training_vars,
          file=opt$training_var_ids)

write_rds(x=set_of_models,
          file=opt$combined_model)

library(readr)
library(dplyr)
library(stringr)
library(optparse)
library(ranger)
source("workflow/scripts/existing_scores_glm_functions.R")
set.seed(1)

option_list = list(
  make_option(c("-c", "--csv_location"), type="character", default="results/combine2_protein/A0A1B0GUS4_w_AFfeatures.csv.gz", 
              help="csv.gz file"),
  make_option(c("-m", "--model_location"), type="character", default="results/prediction/final_written_full_model.RData", 
              help="File containing the full model for prediction"),
  make_option(c("-u", "--use_cols_file"), type="character", default="results/prediction/final_colnames_to_use.RData", 
              help="File containing the colnames to use in the final model"),
  make_option(c("-t", "--toAS_properties"), type="character", default="results/prediction/final_toAS_properties.RData", 
              help="File containing the properties of the alternative amino acids"),
  make_option(c("-b", "--combined_model"), type="character", default="results/prediction_final/combined_model.RData", 
              help="File where the final combined logistic regression model should be saved to"),
  make_option(c("-v", "--training_var_ids"), type="character", default="results/prediction_final/training_var_ids.tsv", 
              help="IDs of variants that were in the training / test set"),
  make_option(c("-o", "--output_file"), type="character", default="results/prediction/predicted.csv.gz", 
              help="Excel file listing columns to use"),
  make_option(c("-r", "--reduced"), type="logical", default="FALSE", 
              help="just save essential columns")
)

opt = parse_args(OptionParser(option_list=option_list))

to_AS_table<-read_tsv("resources/to_AS_table.txt")
variants<-read_csv(opt$csv_location, na=c(".","NA", NA), col_types = cols(.default = "?", REVEL_score = "d"))

# if the variant file is empty, write an empty output file and skip prediction
if (nrow(variants)==0){
  system(paste("touch", opt$output_file))
}else{

model_to_use<-readRDS(opt$model_location)
colnames_to_use<-readRDS(opt$use_cols_file)
toAS_properties<-readRDS(opt$toAS_properties)

set_of_models<-readRDS(opt$combined_model)

training_var_ids<-read_tsv(opt$training_var_ids)
gnomad_var_ids<-(training_var_ids %>% filter(gnomadSet==1))$var_id_genomic
ClinVar_var_ids<-(training_var_ids %>% filter(gnomadSet==0))$var_id_genomic

# function to add amino-acid ("toAS") property differences
addToAS<-function(variants_par, toAsProp, colToUse){
  variants_mod<-variants_par
  variants_mod<-variants_mod%>%
    left_join(toAsProp, by=c("from_AS"="from_AS_toAS"))

  colnames_toAS<-colnames(toAsProp)
  colnames_toAS<-colnames_toAS[colnames_toAS!="from_AS_toAS"]

  sel_vars_to<-str_replace(colnames_toAS, fixed("_toAS"),"")

  variants_mod[, colnames_toAS]<-variants_mod[, sel_vars_to] - variants_mod[, colnames_toAS]
  colToUse_mod<-colToUse[colToUse!="outcome"]

  return(variants_mod[, colToUse_mod])
}


# external function that is also used by preprocess
variants<-prepareVariantsForPrediction(variants, to_AS_table)

#add to AS properties
variants_alph<-addToAS(variants, toAS_properties, colnames_to_use)

# just keep complete cases
variants<-variants[complete.cases(variants_alph),]
variants_alph<-variants_alph[complete.cases(variants_alph),]

# predict AlphScore
variants$AlphScore<-predict(model_to_use, variants_alph)$predictions
rm(variants_alph)

index_col<-get_index_col(variants)

variants$DEOGEN2_score_med<- unlist_score(variants$DEOGEN2_score, index_col)

variants<-predict_set_of_models(set_of_models = set_of_models, variants_to_predict = variants)
colNamesCombinedModels<-unlist(set_of_models[2])

# flag variants that are in the training data set or that are in dbNSFP ClinVar
variants<-variants %>% 
  mutate(in_gnomad_train=var_id_genomic %in% gnomad_var_ids)%>%
  mutate(in_clinvar_ds=var_id_genomic %in% ClinVar_var_ids)


# if wanted, reduce the output to have smaller files
if (opt$reduced==TRUE){
 variants<-variants %>% 
  mutate(ID=paste(`#chr`, `pos(1-based)`, ref, alt, sep=":"))%>%
  select(any_of(colnames(variants)[2:10]), 
         ID, genename, Uniprot_acc_split,Uniprot_acc,HGVSp_VEP_split, HGVSp_VEP, CADD_raw, REVEL_score, DEOGEN2_score, 
         b_factor, SOLVENT_ACCESSIBILITY_core, in_gnomad_train, in_clinvar_ds, AlphScore, any_of(colNamesCombinedModels))
}

# write to disk
write_csv(x=variants,
          file=opt$output_file)
}
library(readr)
library(dplyr)
library(stringr)
library(optparse)
library(ranger)
source("workflow/scripts/existing_scores_glm_functions.R")
set.seed(1)

option_list = list(
  make_option(c("-c", "--csv_location"), type="character", default="results/combine2_protein/P38398_w_AFfeatures.csv.gz", 
              help="csv.gz file"),
  make_option(c("-m", "--model_location"), type="character", default="results/prediction/final_written_full_model.RData", 
              help="File containing the full model for prediction"),
  make_option(c("-u", "--use_cols_file"), type="character", default="results/prediction/final_colnames_to_use.RData", 
              help="File containing the colnames to use in the final model"),
  make_option(c("-t", "--toAS_properties"), type="character", default="results/prediction/final_toAS_properties.RData", 
              help="File containing the properties of the alternative amino acids"),
  make_option(c("-n", "--model_location_null"), type="character", default="results/prediction/final_written_full_model.RData", 
              help="File containing the full model for prediction, null model"),
  make_option(c("-x", "--use_cols_file_null"), type="character", default="results/prediction/final_colnames_to_use.RData", 
              help="File containing the colnames to use in the final model, null model"),
  make_option(c("-v", "--toAS_properties_null"), type="character", default="results/prediction/final_toAS_properties.RData", 
              help="File containing the properties of the alternative amino acids, null model"),
  make_option(c("-r", "--reduced"), type="logical", default="FALSE", 
              help="just save essential columns"),
  make_option(c("-o", "--output_file"), type="character", default="results/prediction/predicted.csv.gz", 
              help="Excel file listing columns to use")
)

opt = parse_args(OptionParser(option_list=option_list))

to_AS_table<-read_tsv("resources/to_AS_table.txt")
variants<-read_csv(opt$csv_location, na=c(".","NA", NA))

# if the variant file is empty, write an empty output file and skip prediction
if (nrow(variants)==0){
  system(paste("touch", opt$output_file))
}else{

model_to_use<-readRDS(opt$model_location)
colnames_to_use<-readRDS(opt$use_cols_file)
toAS_properties<-readRDS(opt$toAS_properties)

model_to_use_null<-readRDS(opt$model_location_null)
colnames_to_use_null<-readRDS(opt$use_cols_file_null)
toAS_properties_null<-readRDS(opt$toAS_properties_null)

# function to add amino-acid ("toAS") property differences
addToAS<-function(variants_par, toAsProp, colToUse){
  variants_mod<-variants_par
  variants_mod<-variants_mod%>%
    left_join(toAsProp, by=c("from_AS"="from_AS_toAS"))

  colnames_toAS<-colnames(toAsProp)
  colnames_toAS<-colnames_toAS[colnames_toAS!="from_AS_toAS"]

  sel_vars_to<-str_replace(colnames_toAS, fixed("_toAS"),"")

  variants_mod[, colnames_toAS]<-variants_mod[, sel_vars_to] - variants_mod[, colnames_toAS]
  colToUse_mod<-colToUse[colToUse!="outcome"]

  return(variants_mod[, colToUse_mod])
}

# external function that is also used by preprocess
variants<-prepareVariantsForPrediction(variants, to_AS_table)

#add to AS properties
variants_alph<-addToAS(variants, toAS_properties, colnames_to_use)

# just keep complete cases
variants<-variants[complete.cases(variants_alph),]
variants_alph<-variants_alph[complete.cases(variants_alph),]

# predict AlphScore
variants$AlphScore<-predict(model_to_use, variants_alph)$predictions
rm(variants_alph)

# predict null model
variants_alph_null<-addToAS(variants, toAS_properties_null, colnames_to_use_null)
variants$Alph_null<-predict(model_to_use_null, variants_alph_null)$predictions

# if wanted, reduce the output to have smaller files
if (opt$reduced==TRUE){
 variants<-variants %>% 
  mutate(ID=paste(`#chr`, `pos(1-based)`, ref, alt, sep=":"))%>%
  select(any_of(colnames(variants)[2:10]), 
         ID, genename, Uniprot_acc_split,Uniprot_acc,HGVSp_VEP_split, HGVSp_VEP, CADD_raw, REVEL_score, DEOGEN2_score, 
         b_factor, SOLVENT_ACCESSIBILITY_core, Alph_null, AlphScore)
}

# write to disk
write_csv(x=variants,
          file=opt$output_file)
}

Gezelvirus Workflow: Studying a Polinton-like Virus in Phaeocystis globosa

suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(tidyr))
suppressPackageStartupMessages(library(stringr))

stdin_fd <- file("stdin")
all.lines <- readLines(stdin_fd)
close(stdin_fd)
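# The input is assumed to be HH-suite (hhsearch/hhblits) plain-text output:
# a parameter header, a hit table that starts after the " No Hit" line, and
# pairwise alignments that start at "No 1".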
param.start <- 1
data.start  <- which(grepl("^ *No Hit", all.lines)) %>% first %>% `+`(1)

align.start <- which(grepl("^No 1", all.lines)) %>% first

param.end <- data.start - 2
data.end  <- align.start - 1
align.end <- length(all.lines)

if (is.na(align.start)) {
    data <- tibble(
        Query           = character(),
        No              =   integer(),
        Hit.ID          = character(),
        Hit.Description = character(),
        Q.ss_pred       = character(),
        Q.query         = character(),
        Q.consensus     = character(),
        Q.Start         =   integer(),
        Q.End           =   integer(),
        Q.Length        =   integer(),
        T.consensus     = character(),
        T.Start         =   integer(),
        T.End           =   integer(),
        T.Length        =   integer(),
        T.hit           = character(),
        T.ss_dssp       = character(),
        T.ss_pred       = character(),
        Aligned_cols    =   integer(),
        E.value         =   numeric(),
        Identities      =   numeric(),
        Probab          =   numeric(),
        Score           =   numeric(),
        Similarity      =   numeric(),
        Sum_probs       =   numeric(),
        Template_Neff   =   numeric()
    )
} else {
    metadata <- data.frame(key = all.lines[param.start:param.end]) %>%
        mutate(value = substr(key, 14, 10000) %>% trimws, key = substr(key, 1, 14) %>% trimws) %>%
        filter(key != "") %>%
        {setNames(.$value, .$key)} %>%
        as.list
    data <- data.frame(Query = sub(" .*", "", metadata$Query), line = all.lines[align.start:align.end], stringsAsFactors = F) %>%
        filter(line != "") %>%
        extract(line, into = c("name", "value"), regex = "([^ ]+) ?(.+)?", remove = F) %>%
        mutate(No = ifelse(name == "No", value, NA) %>% as.integer) %>%
        mutate(Hit.ID = ifelse(substr(name, 1, 1) == ">", substr(name, 2, nchar(name)), NA)) %>%
        mutate(Hit.Description = ifelse(substr(name, 1, 1) == ">", value, NA)) %>%
        mutate(Match = ifelse(grepl("=", name), line, NA)) %>%
        mutate(name = ifelse(grepl("Q Consensus", lag(line)) & grepl("T Consensus", lead(line)), "M", name)) %>%
        mutate(value = ifelse(name == "M", line, value)) %>%
        fill(No) %>%
        group_by(Query, No) %>%
        summarize(
            Hit.ID       = na.omit(Hit.ID) %>% first,
            Hit.Description = na.omit(Hit.Description) %>% first,
            Match        = na.omit(Match) %>% first,
            Q.ss_pred    = value[name == "Q" & grepl("^ss_pred ", value)]         %>% substr(., 16, nchar(.)) %>% paste(collapse = "") %>% gsub(" +", "", .),
            Q.query      = value[name == "Q" & grepl("^Consensus ", lead(value))] %>% substr(., 16, nchar(.)) %>% paste(collapse = " "),
            Q.consensus  = value[name == "Q" & grepl("^Consensus ", value)]       %>% substr(., 16, nchar(.)) %>% paste(collapse = " "),
            T.consensus  = value[name == "T" & grepl("^Consensus ", value)]       %>% substr(., 16, nchar(.)) %>% paste(collapse = " "),
            T.hit        = value[name == "T" & grepl("^Consensus ", lag(value))]  %>% substr(., 16, nchar(.)) %>% paste(collapse = " "),
            T.ss_dssp    = value[name == "T" & grepl("^ss_dssp ", value)]         %>% substr(., 16, nchar(.)) %>% paste(collapse = " ") %>% gsub(" +", "", .),
            T.ss_pred    = value[name == "T" & grepl("^ss_pred ", value)]         %>% substr(., 16, nchar(.)) %>% paste(collapse = "")  %>% gsub(" ", "", .),
            .groups = "drop"
        ) %>%
        extract(Q.consensus, into = c("Q.Start", "Q.End", "Q.Length"), regex = "^ *(\\d+) .+ (\\d+) +[(](\\d+)[)]$", remove = F, convert = T) %>%
        extract(T.consensus, into = c("T.Start", "T.End", "T.Length"), regex = "^ *(\\d+) .+ (\\d+) +[(](\\d+)[)]$", remove = F, convert = T) %>%
        mutate(
            Q.consensus  = gsub("[0-9() ]+", "", Q.consensus),
            Q.query      = gsub("[0-9() ]+", "", Q.query),
            T.consensus  = gsub("[0-9() ]+", "", T.consensus),
            T.hit        = gsub("[0-9() ]+", "", T.hit)
        ) %>%
        #extract(Hit.Description, into = "Hit.Organism",    regex = "[{]([^}]+)[}]",  remove = F) %>%
        #extract(Hit.Description, into = "Hit.Description", regex = "([^;]+)",        remove = F) %>%
        #extract(Hit.Description, into = "Hit.Keywords",    regex = "[^;]+; ([^;]+)", remove = F) %>%
        mutate(Match = str_split(Match, " +")) %>%
        unnest(cols = Match) %>%
        separate(Match, into = c("key", "value"), "=") %>%
        mutate(value = sub("%", "", value) %>% as.numeric) %>%
        spread(key, value) %>%
        rename(E.value = `E-value`) %>%
        mutate(Aligned_cols = as.integer(Aligned_cols))
}

write.table(data, quote = F, sep = "\t", row.names = F)

library(dplyr)
library(tidyr)
library(ape)
library(ggtree)
library(treeio)
library(phangorn)
library(stringr)
library(ggplot2)
library(phytools)

# When run outside Snakemake, mock the snakemake S4 object for interactive debugging
if (interactive()) {
    setClass("snake", slots = list(input = "list", output = "list", params = "list"))
    snakemake <- new("snake", input = list(
            tree = "analysis/phylogeny/MCP_NCLDV_epa/epa_result.newick",
            fasta = "analysis/phylogeny/MCP_NCLDV.fasta",
            outgroups = "metadata/queries/MCP_NCLDV_outgroups.faa",
            synonyms = "metadata/organisms.txt",
            hmm = Sys.glob("hmm_algae/*.hmm")
    ), output = list(
        image = "test.svg",
        jtree = "output/MCP_NCLDV.jtree"
    ), params = list(
        outgroup_rooting = TRUE  # assumed value; snakemake@params is read below
    ))
}

with(snakemake@input, {
    tree_file     <<- tree
    fasta_file    <<- fasta
    synonyms_file <<- synonyms
    outgroup_file <<- outgroups
})
with(snakemake@output, {
    out_image_file <<- image
    out_jtree_file <<- jtree
})
with(snakemake@params, {
    outgroup_rooting <<- outgroup_rooting
})

read.fasta.headers <- function(fnames) {
    file.info(fnames) %>%
        filter(size > 0) %>%
        rownames %>%
        lapply(treeio::read.fasta) %>%
        lapply(names) %>%
        unlist %>%
        data.frame(title = .)
}

synonyms <- read.table(synonyms_file, header = T, sep = "\t", fill = T, na.strings = "") %>%
    mutate(Collapse = ifelse(is.na(Collapse), Name, Collapse))

headers <- read.fasta.headers(fasta_file) %>%
    extract(title, into = c("label", "ID"), regex = "^([^ ]+) ([^ ]+)", remove = F) %>%
    left_join(synonyms, by = "ID")

no_name <- filter(headers, is.na(Name)) %>%
    pull(label) %>%
    paste(collapse = ", ")
if (no_name != "") {
    print(paste("No aliases found for: ", no_name))
    quit(status = 1)
}

tree <- read.tree(tree_file)
tree <- phangorn::midpoint(tree, node.labels = "support")
if (outgroup_rooting) {
    outgroup_df <- read.fasta.headers(outgroup_file)
    outgroups <- with(outgroup_df, sub(" .*", "", title))
    tree <- ape::root(tree, node = MRCA(tree, outgroups), edgelabel = T, resolve.root = T)
}
tree <- as_tibble(tree) %>%
    mutate(support = ifelse(node %in% parent & label != "", label, NA)) %>%
    separate(support, into = c("SH_aLRT", "UFboot"), sep = "/", convert = T) %>%
    left_join(headers, by = "label") %>%
    mutate(label.show = Name) %>%
    mutate(isInternal = node %in% parent) %>%
    `class<-`(c("tbl_tree", "tbl_df", "tbl", "data.frame"))
tree_data <- as.treedata(tree)
write.jtree(tree_data, file = out_jtree_file)

ntaxa <- filter(tree, ! node %in% parent) %>% nrow

colors <- list(
    Haptophyta = "orange",
    Chlorophyta = "green",
    Streptophyta = "darkgreen",
    MAG = "purple",
    Stramenopiles = "brown",
    Cryptophyta = "red",
    Amoebozoa = "gold4",
    Euglenozoa = "yellow",
    Choanoflagellata = "darkslateblue",
    Glaucophyta = "cyan",
    Animals = "blue",
    Dinoflagellata = "gray50",
    Rhizaria = "gray30"
)

scaleClades <- function(p, df) {
    with(df, Reduce(function(.p, .node) {
        offs <- offspring(.p$data, .node)
        scale <- 0.5 / (nrow(offs) - 1)
        scaleClade(.p, .node, scale)
    }, node, p))
}
collapseClades <- function(p, df) {
    with(df, Reduce(function(.p, .node) {
        fill <- unlist(colors[Host[node == .node]])
        .p$data[.p$data$node == .node, "label.show"] <- label.show[node == .node]
        collapse(.p, .node, "mixed", fill = fill)
    }, node, p))
}
#labelClades <- function(p) {
#    with(df, Reduce(function(.p, .node) {
#        .p + geom_cladelab(node = .node, label = label[node == .node], align = T, offset = .2, textcolor = 'blue')
#    }, node, p))
#}

multi_species <- allDescendants(tree_data@phylo) %>%
    lapply(function(x) filter(tree, node %in% x)) %>%
    bind_rows(.id = "ancestor") %>%
    group_by(ancestor) %>%
    filter(n_distinct(Collapse, na.rm = T) == 1, sum(!isInternal) > 1) %>% # , !any(Group == "Haptophyta")) %>%
    ungroup %>%
    mutate(ancestor = as.numeric(ancestor)) %>%
    filter(! ancestor %in% node) %>%
    filter(!is.na(Collapse)) %>%
    group_by(ancestor, Collapse) %>%
    summarize(num_tips = sum(!isInternal), Host = first(na.omit(Host))) %>%
    mutate(label.show = sprintf("%s (%d)", Collapse, num_tips)) %>%
    rename(node = ancestor)
p <- ggtree(tree_data) +
    geom_nodepoint(aes(x = branch, subset = !is.na(UFboot) & UFboot >= 90, size = UFboot)) +
    geom_tiplab(aes(label = label.show), size = 4, align = T, linesize = 0) +
    geom_text2(aes(subset = node %in% multi_species$node, x = max(x, na.rm = T), label = label.show), nudge_x = 0.01, size = 4, hjust = 0) +
    geom_tippoint(aes(color = Host), size = 3) +
    geom_treescale(width = 0.5) +
    scale_size_continuous(limits = c(90, 100), range = c(1, 3)) +
    scale_shape_manual(values = seq(0,15)) +
    scale_color_manual(values = colors)

p <- scaleClades(p, multi_species)
p <- collapseClades(p, multi_species)
# p <- facet_plot(p, mapping = aes(x = as.numeric(as.factor(query.name)), shape = DESC), data = genes, geom = geom_point, panel = 'Genes')

ggsave(out_image_file, p, height = ntaxa * 0.1, width = 7, limitsize = F)
stringr

Simple, Consistent Wrappers for Common String Operations: A consistent, simple and easy-to-use set of wrappers around the fantastic 'stringi' package. All function and argument names (and positions) are consistent, all functions deal with NAs and zero-length vectors in the same way, and the output from one function is easy to feed into the input of another.
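
As a minimal illustration of that consistency (with hypothetical values): the string vector is always the first argument, NA propagates rather than erroring, and one call's output feeds directly into the next.

library(stringr)

x <- c("apple pie", NA, "banana")

str_detect(x, "an")       # FALSE NA TRUE: NA in, NA out
str_replace(x, "a", "A")  # "Apple pie" NA "bAnana" (first match only)
str_length(character(0))  # integer(0): zero-length in, zero-length out

# Because the string always comes first, calls compose naturally:
str_to_upper(str_trim("  mixed case  "))  # "MIXED CASE"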