Code Snippets

WES analysis pipeline

Sys.setenv("TZDIR"=paste0(Sys.getenv("CONDA_PREFIX"), "/share/zoneinfo"))

library(readr)
library(dplyr)
library(ggplot2)

df <- read_csv(snakemake@input[[1]])
df <- df %>%
    mutate(sample_type = factor(sample_type, levels = c("normal", "tumor")))
p <- df %>%
    ggplot(aes(sample_type, mean)) +
    geom_boxplot(aes(fill = sample_type)) +
    labs(x = "", y = "Average Depth") +
    guides(fill = guide_legend(title="")) +
    ggtitle("Sequencing Depths")
ggsave(snakemake@output[[1]], p)

R ggplot2 dplyr readr From line 1 of scripts/plot_depth.R

Finding cryptic relationships to boost disease gene detection (v0.3.0dev1-1)

library(ggplot2)
library(dplyr)
library(tribes.tools)

R Markdown ggplot2 dplyr From line 13 of notebooks/compare_vs_true_rel.Rmd

#make plot comparing estimated to true degree - excluding unrelated as it distorts scale
print(
    ggplot(estimatedVsTruth, aes(Degree, EstDegree)) 
        + geom_point(aes(size = n)) 
        + geom_text(aes(label=n),hjust=0.5, vjust=-1, colour='red') 
        + theme_bw()
        + labs(title="Predicted vs True with UR",
                        y="Estimated degree of relationship",
                        x="Degree of reported relationship")
)

R Markdown ggplot2 From line 56 of notebooks/compare_vs_true_rel.Rmd

#make plot comparing estimated to true degree - excluding unrelated as it distorts scale
print(
    ggplot(estimatedVsTruth %>% filter(Degree != "UR") , aes(Degree, EstDegree)) 
        + geom_point(aes(size = n)) 
        + geom_text(aes(label=n),hjust=0.5, vjust=-1, colour='red') 
        + theme_bw()
        + labs(title="Predicted vs True no UR",
                        y="Estimated degree of relationship",
                        x="Degree of reported relationship")
)

R Markdown ggplot2 From line 70 of notebooks/compare_vs_true_rel.Rmd

Pipeline to pull microbial reads from WGS data and perform metagenomic analysis (v1.1)

library(tidyverse)
library(MetBrewer)

#### Load Data ####
concatenated_alignments <- read_tsv(snakemake@input[["alignment_stats"]], col_names = c("SN", "sequence", "num_sequences", "file_samples"))
concatenated_alignments$file_samples[concatenated_alignments$file_samples == "# excluding supplementary and secondary reads"] <- ""

#### Get List of Sample Names ####
sample_names <- concatenated_alignments %>%
  separate(file_samples, c("file", "sample"), sep = "stats_ffq/") %>%
  separate(sample, c("sample", "junk"), sep = "_align") %>%
  select(sequence, num_sequences, sample) %>%
  group_by(sample) %>%
  summarise(n()) %>%
  select(sample) %>%
  filter(sample != "NA")

sample_names <- as.vector(sample_names$sample)

#### Clean Data ####
concatenated_alignments_clean <- concatenated_alignments %>%
  select(!SN) %>%
  separate(file_samples, c("file", "sample"), sep = "stats_ffq/") %>%
  separate(sample, c("sample", "junk"), sep = "_align") %>%
  select(sequence, num_sequences, sample) %>%
  mutate(sample = replace(sample, is.na(sample), sample_names))

##### Generate Summary Tables ######
# Wider, 'human readable' format with proportions
concatenated_alignments_clean_wide <- concatenated_alignments_clean %>% 
  group_by(sample) %>%
  pivot_wider(names_from = sequence, values_from = num_sequences) %>%
  mutate(proportion_mapped = `reads mapped:` / `raw total sequences:` * 100) 

write.table(concatenated_alignments_clean_wide, file = snakemake@output[["human_read_tbl"]], sep = "\t", row.names = FALSE)
write.table(concatenated_alignments_clean, file = snakemake@output[["long_read_tbl"]], sep = "\t", row.names = FALSE)

#### Plot Summaries ####
alignment_statistics_plot <- concatenated_alignments_clean %>%
  ggplot() + 
  aes(x = sample, y = num_sequences, fill = sequence) %>%
  geom_bar(stat = "identity", position = "dodge") + 
  theme_bw(base_size = 15) + 
  scale_fill_manual(values = met.brewer("Redon", n = 3, type = "discrete"), 
                    labels = c("Raw Read Number", "Reads Mapped", "Reads Unmapped")) + 
  xlab("Sample") + 
  ylab("Read Number") + 
  theme(legend.title = element_blank(), 
        axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1), 
        legend.position = "top") 

ggsave(snakemake@output[["alignment_stat_plot"]], alignment_statistics_plot)

R ggplot2 tidyverse MetBrewer From line 3 of scripts/combined_kraken_alignment_stat_ffq.R

library(tidyverse)

genus_delim <- function(){
  if(snakemake@params[["kraken_db_type"]]){
    return("g__")
  }else{
    return("g__g__")
  }
}


spp_delim <- function(){
  if(snakemake@params[["kraken_db_type"]]){
    return("s__")
  }else{
    return("s__s__")
  }
}

genus <- genus_delim()
spp <- spp_delim()

data <- read_tsv(snakemake@input[["combined_report"]])

##### Get Domain #####
kingdom_data <- data %>%
  filter(!grepl("p__|g__|s__|c__|o__|f__", `#Classification`)) %>%
  separate(`#Classification`, c("junk", "kingdom"), sep = "^k__") %>%
  select(!junk)

kingdom_data_long <- kingdom_data %>%
  pivot_longer(cols = !kingdom, names_to = "study", values_to = "read_number") 

##### Get genus  #####
genus_data <- data %>%
  filter(grepl("g__", `#Classification`)) %>%
  separate(`#Classification`, c("junk", "genus"), sep = genus) %>%
  select(!junk) %>%
  filter(!grepl("s__", genus)) 

genus_data_long <- genus_data %>%
  pivot_longer(cols = !genus, names_to = "study", values_to = "read_number") 

##### Get species  #####
species_data <- data %>% 
  filter(grepl("s__", `#Classification`)) %>%
  separate(`#Classification`, c("junk", "species"), sep = spp) %>%
  select(!junk)

species_data_long <- species_data %>% 
  pivot_longer(cols = !species, names_to = "study", values_to = "read_number") 

##### How many reads per study? #####
total_reads_per_study <- species_data_long %>% 
  group_by(study) %>%
  summarise(total_reads = sum(read_number)) 

classified_reads_plot <- total_reads_per_study %>% 
  ggplot() + 
  aes(x = study, y = total_reads) + 
  geom_col() + 
  theme_classic() + 
  coord_flip()

##### Species Classification Passed a specific threshold (unfinished) ##### 
species_data_filtered <- species_data_long %>%
  filter(read_number > 1000)

genus_data_filtered <- genus_data_long %>%
  filter(read_number > 1000)

#### Export Dataframes to TSV files ####
write.table(kingdom_data_long, file = snakemake@output[["kingdom_table"]], sep = "\t", row.names = FALSE)
write.table(genus_data_long, file = snakemake@output[["genus_table"]], sep = "\t", row.names = FALSE)
write.table(species_data_long, file = snakemake@output[["spp_table"]], sep = "\t", row.names = FALSE)
write.table(total_reads_per_study, file = snakemake@output[["classified_reads"]], sep = "\t", row.names = FALSE)

#### Export Classified Reads Plot ####
pdf(snakemake@output[["classified_reads_plot"]])
print(classified_reads_plot)
dev.off()

R ggplot2 tidyverse From line 2 of scripts/generate_kraken_summaries.R

library(tidyverse)
# Get colour pallette
cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
# Set Read Filter
filter <- as.character(snakemake@params[["bracken_threshold"]])
filter <- as.numeric(filter)

#### Load Data #####
conc_data <- read_tsv(snakemake@input[["conc_input"]], col_names = c("species",
                                                                       "taxa_id", 
                                                                       "taxa_lvl", 
                                                                       "kraken_assigned_reads", 
                                                                       "added_reads", 
                                                                       "new_est_reads", 
                                                                       "fraction_total", 
                                                                       "file_path")) %>%
  filter(species != "name")

conc_data_clean <- conc_data %>%
  separate(file_path, c("file", "sample"), sep = "out/") %>%
  separate(sample, c("sample", "junk"), sep = "_bracken") %>%
  select(sample, species, taxa_id, taxa_lvl, kraken_assigned_reads, added_reads, new_est_reads, fraction_total) %>%
  mutate_at(c("taxa_id", "kraken_assigned_reads", "added_reads", "new_est_reads", "fraction_total"), as.numeric)


#### Added Reads Across Samples ####

sample_summary <- conc_data_clean %>%
  group_by(sample) %>%
  summarise(kraken_reads = sum(kraken_assigned_reads), 
            added_reads = sum(added_reads), 
            total_new_reads = sum(new_est_reads))

sample_summary_long <- sample_summary %>%
  pivot_longer(cols = !sample, 
               names_to = "classification_type", 
               values_to = "num_reads") 

sample_summary_plot <- sample_summary %>%
  pivot_longer(cols = !sample, 
               names_to = "classification_type", 
               values_to = "num_reads") %>%
  filter(classification_type != "total_new_reads") %>%
  ggplot() + 
  aes(reorder(x = sample, desc(num_reads)), y = num_reads, fill = classification_type) + 
  geom_bar(stat = "identity", position = "stack") + 
  coord_flip() + 
  scale_fill_manual(values = cbPalette) + 
  theme_classic(base_size = 25) + 
  xlab("Sample") + 
  ylab("Number of Reads")

ggsave(snakemake@output[["added_reads_plot"]], height = 15, width = 20)

#### Stratified Heatmap of Relative Abundance Per Per Sample ####

conc_data_clean$species <- sub(" ", "_", conc_data_clean$species)

conc_data_clean_sep <- conc_data_clean %>%
  separate(species, into = c("Genus", "Species"), sep = "_") %>%
  filter(kraken_assigned_reads > filter)

conc_data_clean_genus_group <- conc_data_clean_sep %>%
  group_by(Genus) %>%
  summarise(n())

plot_stratified_heatmap <- function(){
  for(i in conc_data_clean_genus_group$Genus) {
  print(
    conc_data_clean_sep %>%
      filter(Genus == i) %>% 
      ggplot() + 
      aes(x = sample, y = Species) + 
      geom_tile(aes(fill = new_est_reads), linejoin = "round") + 
      scale_fill_gradientn(colours = c("white", "#F0E442", "#E69F00")) +
      facet_grid(rows = "Genus") + 
      theme_classic(base_size = 25) + 
      theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) + 
      xlab("Sample"))
  }
}

pdf(snakemake@output[["bracken_stratified_heatmap"]], height = 20, width = 20)
plot_stratified_heatmap() 
dev.off()

##### Heatmap of Species #####
plot_heatmap_per_sample <- function(data, filter_thresh){
  data %>%
    filter(new_est_reads > filter_thresh) %>%
    select(sample, species, fraction_total) %>%
    ggplot() +
    aes(x = sample, y = species) +
    geom_tile(aes(fill = log(fraction_total))) + 
    scale_fill_gradientn(colours = c("white", "orange", "blue")) + 
    theme_classic() + 
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 0.5)) + 
    ggtitle(paste("Log Relative Abundance of Microbial Species filter: Filtered > ", filter_thresh, sep = ""))
}

pdf(snakemake@output[["bracken_overall_heatmap"]], height = 20, width = 20)
plot_heatmap_per_sample(conc_data_clean, filter)
dev.off()

R ggplot2 tidyverse From line 3 of scripts/plot_bracken_results.R

library(tidyverse)
library(MetBrewer)

# Load Appropriate Columns 
columns <- c("bin", "marker_lineage", "num_genomes", "num_markers", "num_marker_sets", "completeness", "contamination",
             "strain_heterogen", "genome_size_bp", "num_ambiguous_bases", "num_scaffolds", "num_contigs", "N50_scaffold", "N50_contig",
             "mean_scaffold_length", "mean_contig_length", "longest_scaffold", "longest_contig", 
             "GC", "GC_std", "coding_density", "translation_table", "num_predicted_genes", "0", "1", "2", "3", "4", "5+", "sample")

# Read Data, skip first row and remove the redundant header columns from previous step
bin_stats <- read_tsv(snakemake@input[["checkm_conc_result"]], col_names = columns, skip = 1) %>%
  filter(bin != "Bin Id")

# Conver to Numeric Columns
bin_stats$completeness <- as.numeric(bin_stats$completeness)
bin_stats$contamination <- as.numeric(bin_stats$contamination)
bin_stats$genome_size_bp <- as.numeric(bin_stats$genome_size_bp)
bin_stats$num_contigs <- as.numeric(bin_stats$num_contigs)
bin_stats$mean_contig_length <- as.numeric(bin_stats$mean_contig_length)
bin_stats$N50_contig <- as.numeric(bin_stats$N50_contig)

# Get breaks for plotting 
bin_stats$col <- cut(bin_stats$completeness, 
                        breaks = c(-Inf, 80, Inf), 
                        labels = c("<80", ">=80"))

bin_stats$contig_cut <- cut(bin_stats$num_contigs, 
                     breaks = c(-Inf, 250, Inf), 
                     labels = c("<=250", ">250"))

# Plot Completeness Vs Contamination Per Sample
CompletenessVsContam <- bin_stats %>%
  ggplot() + 
  aes(x = completeness, y = contamination, size = genome_size_bp, col = col) %>%
  geom_point(alpha = 0.5) + 
  scale_x_continuous(limits = c(0, 100)) +
  theme_minimal() + 
  scale_color_manual(values = c("blue", "orange"))

pdf(snakemake@output[["CompletenessVsContam"]])
print(CompletenessVsContam)
dev.off()

#### Plot Completeness Vs Contamination Per Sample
bin_stats_sample_split <- bin_stats %>%
  separate(bin, into = c("sample", "bin_num"), sep = "\\.")

compvscontam_samp <- bin_stats_sample_split %>%
  ggplot() + 
  aes(x = completeness, y = contamination, col = sample) %>%
  geom_point(alpha = 0.8, size = 4) + 
  scale_x_continuous(limits = c(0, 100 )) +
  geom_vline( xintercept = 80, linetype = "dotted", 
              color = "green", size = 1) + 
  geom_vline(xintercept = 50, linetype = "dotted", 
             color = "orange", size = 1) + 
  theme_bw(base_size = 18) + 
  scale_color_manual(values = met.brewer("Redon", 10, type = "discrete")) + 
  xlab("Completeness") + 
  ylab("Contamination") + 
  labs(colour = "Sample") + 
  theme(legend.position = "bottom", 
        legend.direction = "horizontal", 
        legend.text = element_text(size = 10))

pdf(snakemake@output[["CompletenessVsContam_Per_Sample"]])
print(compvscontam_samp)
dev.off()

#### Plot Bar Chart with Completeness vs Contamination Per Sample
bin_stats_bar <- bin_stats_sample_split %>%
  unite("sample_bin", sample:bin_num) %>%
  select(completeness, contamination, sample_bin) %>%
  pivot_longer(c(completeness, contamination), 
              names_to = "bin_rank", 
              values_to = "value") %>%
  ggplot() + 
  aes(reorder(x = sample_bin, desc(value)), y = value, fill = bin_rank) + 
  geom_bar(stat = "identity", position = "dodge") + 
  theme_bw(base_size = 15) + 
  coord_flip() + 
  scale_fill_manual(values = met.brewer("Redon", 2, type = "discrete"), labels = c("Completeness", "Contamination")) + 
  geom_hline(yintercept = 80, linetype = "dotted", 
              color = "green", size = 1) + 
  geom_hline(yintercept = 50, linetype = "dotted", 
             color = "orange", size = 1) + 
  xlab("Sample") + 
  ylab("Contamination and Completion Score (%)") + 
  labs(fill = "CheckM Completion vs Contamination Score") + 
  theme(legend.position = "bottom", 
        legend.direction = "vertical")

pdf(snakemake@output[["BarChartCompletenessContamination"]])
print(bin_stats_bar)
dev.off()

#### Plot the number of contigs vs completeness
NumContigsVsCompleteness <- bin_stats %>%
  ggplot() + 
  aes(x = num_contigs, y = completeness, size = mean_contig_length, col = contig_cut) + 
  geom_point(alpha = 0.5) + 
  theme_minimal() + 
  scale_color_manual(values = c("blue", "orange"))

pdf(snakemake@output[["NumContigsVsCompleteness"]])
print(NumContigsVsCompleteness)
dev.off()

R ggplot2 tidyverse MetBrewer From line 2 of scripts/plot_checkm_QA.R

library(tidyverse)
library(MetBrewer)

#### Import Kraken Data
concatenated_kraken_summaries <- read_tsv(snakemake@input[["concatenated_kraken_summary"]], 
                                            col_names = c("proportion_of_reads", 
                                                        "number_of_reads", 
                                                        "number_of_reads2",
                                                        "read_code", 
                                                        "read_code2", 
                                                        "classification_status", 
                                                        "file_path"))

concatenated_kraken_summaries_clean <- concatenated_kraken_summaries %>%
  separate(file_path, into = c("junk", "filename"), sep = "classified_summary/") %>%
  separate(filename, into = c("sample", "junk2"), sep = "_classified_summary.txt") %>%
  select(sample, proportion_of_reads, number_of_reads, classification_status) 


#### Plot Proportion of Classified vs Unclassified Reads from Kraken
plot_kraken_proportion <- function(kraken_table){

  concatenated_kraken_summaries_clean_plot <- kraken_table %>%
    ggplot() + 
    aes(x = sample, y = number_of_reads, fill = classification_status) + 
    geom_bar(stat = "identity", position = "fill") + 
    theme_bw(base_size = 15) + 
    scale_fill_manual(values = met.brewer("Redon", n = 2, type = "continuous"), 
                      labels = c("Classified Reads", "Unclassified Reads")) + 
    xlab("Sample") + 
    ylab("Proportion of Reads") + 
    theme(legend.title = element_blank(), 
          axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1), 
          legend.position = "top")

  return(concatenated_kraken_summaries_clean_plot)

}

kraken_classification_proportions <- plot_kraken_proportion(concatenated_kraken_summaries_clean)

#### Save file
ggsave(snakemake@output[["classified_proportions"]], kraken_classification_proportions)

R ggplot2 tidyverse MetBrewer From line 2 of scripts/plot_classifiedVsunclassified_reads.R

library(tidyverse)
library(MetBrewer)

##### Import Data #####

spp_tbl <- read_tsv(snakemake@input[["spp_table"]])
genus_tbl <- read_tsv(snakemake@input[["genus_table"]]) 

##### Define User Read Threshold #####
filter_threshold <- as.character(snakemake@params[["strat_thresh"]])
filter_threshold <- as.numeric(filter_threshold)

genus_thres <- as.character(snakemake@params[["genus_thresh"]])
genus_thres <- as.numeric(genus_thres)

spp_thres <- as.character(snakemake@params[["species_thresh"]])
spp_thres <- as.numeric(spp_thres)

##### Clean and Prep Data #####
genus_tbl_clean <- genus_tbl %>%
  separate(study, into = c("sample", "junk"), sep = "_") %>%
  select(!junk) %>%
  rename("phylo_level" = genus)

spp_tbl_clean <- spp_tbl %>%
  separate(study, into = c("sample", "junk"), sep = "_") %>%
  select(!junk) %>%
  rename("phylo_level" = species)

##### Plot Function #####

kraken_heatmap <- function(clean_kraken_table, read_thresh, type){
  heatmap <- clean_kraken_table %>%
    filter(read_number > read_thresh) %>%
    filter(phylo_level != "Homo_sapiens") %>%
    ggplot() + 
    aes(x = sample, y = phylo_level) + 
    geom_tile(aes(fill = log(read_number)), linejoin = "round") + 
    scale_fill_gradientn(colours = met.brewer("Hiroshige", 4, type = "discrete")) +
    theme_bw(base_size = 20) +
    theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1), 
          legend.position = "top", 
          legend.text = element_text(size = 15), 
          legend.title = element_text(size = 15)) + 
    labs(fill = "Read Number (log)") + 
    xlab("Sample") + 
    ylab(type)

  return(heatmap)

}

species_heatmap <- kraken_heatmap(spp_tbl_clean, spp_thres, "Species")
genus_heatmap <- kraken_heatmap(genus_tbl_clean, genus_thres, "Genus")

ggsave(snakemake@output[["genus_heatmap"]], genus_heatmap, height = 20, width = 15)
ggsave(snakemake@output[["species_heatmap"]], species_heatmap, height = 20, width = 15)

##### Faceted Plot By Genus #####
spp_tbl_split <- spp_tbl_clean %>% 
  separate(phylo_level, into = c("Genus", "Species1", "Species2"), sep = "_") %>%
  unite("Species", Species1:Species2, na.rm = TRUE) 

genus_list <- spp_tbl_split %>%
  group_by(Genus) %>%
  summarise(n()) 

plot_species_facets <- function() {
  for(i in genus_list$Genus) {
    print(
      spp_tbl_split %>%
        filter(Genus == i) %>% 
        ggplot() + 
        aes(x = sample, y = Species) + 
        geom_tile(aes(fill = read_number), linejoin = "round") + 
        scale_fill_gradientn(colours = c("white", "#F0E442", "#E69F00")) +
        facet_grid(rows = "Genus") + 
        theme_classic(base_size = 25) + 
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
        )
    }
}

pdf(snakemake@output[["stratified_heatmap"]], height = 20, width = 20)
plot_species_facets()
dev.off()

R ggplot2 tidyverse MetBrewer From line 2 of scripts/plot_kraken_heatmaps.R

Workflow Steps and Code Snippets

WES analysis pipeline

Finding cryptic relationships to boost disease gene detection (v0.3.0dev1-1)

Pipeline to pull microbial reads from WGS data and perform metagenomic analysis (v1.1)

ggplot2