Workflow Steps and Code Snippets
1022 tagged steps and code snippets that match the keyword "ggplot2"
WES analysis pipeline
Sys.setenv("TZDIR" = paste0(Sys.getenv("CONDA_PREFIX"), "/share/zoneinfo"))

library(readr)
library(dplyr)
library(ggplot2)

df <- read_csv(snakemake@input[[1]])

df <- df %>%
  mutate(sample_type = factor(sample_type, levels = c("normal", "tumor")))

p <- df %>%
  ggplot(aes(sample_type, mean)) +
  geom_boxplot(aes(fill = sample_type)) +
  labs(x = "", y = "Average Depth") +
  guides(fill = guide_legend(title = "")) +
  ggtitle("Sequencing Depths")

ggsave(snakemake@output[[1]], p)
Finding cryptic relationships to boost disease gene detection (v0.3.0dev1-1)
library(ggplot2)
library(dplyr)
library(tribes.tools)
# make plot comparing estimated to true degree - including unrelated (UR)
print(
  ggplot(estimatedVsTruth, aes(Degree, EstDegree)) +
    geom_point(aes(size = n)) +
    geom_text(aes(label = n), hjust = 0.5, vjust = -1, colour = 'red') +
    theme_bw() +
    labs(title = "Predicted vs True with UR",
         y = "Estimated degree of relationship",
         x = "Degree of reported relationship")
)
# make plot comparing estimated to true degree - excluding unrelated as it distorts scale
print(
  ggplot(estimatedVsTruth %>% filter(Degree != "UR"), aes(Degree, EstDegree)) +
    geom_point(aes(size = n)) +
    geom_text(aes(label = n), hjust = 0.5, vjust = -1, colour = 'red') +
    theme_bw() +
    labs(title = "Predicted vs True no UR",
         y = "Estimated degree of relationship",
         x = "Degree of reported relationship")
)
Pipeline to pull microbial reads from WGS data and perform metagenomic analysis (v1.1)
library(tidyverse)
library(MetBrewer)

#### Load Data ####
concatenated_alignments <- read_tsv(snakemake@input[["alignment_stats"]],
                                    col_names = c("SN", "sequence", "num_sequences", "file_samples"))

concatenated_alignments$file_samples[concatenated_alignments$file_samples == "# excluding supplementary and secondary reads"] <- ""

#### Get List of Sample Names ####
sample_names <- concatenated_alignments %>%
  separate(file_samples, c("file", "sample"), sep = "stats_ffq/") %>%
  separate(sample, c("sample", "junk"), sep = "_align") %>%
  select(sequence, num_sequences, sample) %>%
  group_by(sample) %>%
  summarise(n()) %>%
  select(sample) %>%
  filter(sample != "NA")

sample_names <- as.vector(sample_names$sample)

#### Clean Data ####
concatenated_alignments_clean <- concatenated_alignments %>%
  select(!SN) %>%
  separate(file_samples, c("file", "sample"), sep = "stats_ffq/") %>%
  separate(sample, c("sample", "junk"), sep = "_align") %>%
  select(sequence, num_sequences, sample) %>%
  mutate(sample = replace(sample, is.na(sample), sample_names))

##### Generate Summary Tables #####
# Wider, 'human readable' format with proportions
concatenated_alignments_clean_wide <- concatenated_alignments_clean %>%
  group_by(sample) %>%
  pivot_wider(names_from = sequence, values_from = num_sequences) %>%
  mutate(proportion_mapped = `reads mapped:` / `raw total sequences:` * 100)

write.table(concatenated_alignments_clean_wide, file = snakemake@output[["human_read_tbl"]],
            sep = "\t", row.names = FALSE)
write.table(concatenated_alignments_clean, file = snakemake@output[["long_read_tbl"]],
            sep = "\t", row.names = FALSE)

#### Plot Summaries ####
alignment_statistics_plot <- concatenated_alignments_clean %>%
  ggplot() +
  aes(x = sample, y = num_sequences, fill = sequence) +
  geom_bar(stat = "identity", position = "dodge") +
  theme_bw(base_size = 15) +
  scale_fill_manual(values = met.brewer("Redon", n = 3, type = "discrete"),
                    labels = c("Raw Read Number", "Reads Mapped", "Reads Unmapped")) +
  xlab("Sample") +
  ylab("Read Number") +
  theme(legend.title = element_blank(),
        axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1),
        legend.position = "top")

ggsave(snakemake@output[["alignment_stat_plot"]], alignment_statistics_plot)
library(tidyverse)

genus_delim <- function(){
  if(snakemake@params[["kraken_db_type"]]){
    return("g__")
  } else {
    return("g__g__")
  }
}

spp_delim <- function(){
  if(snakemake@params[["kraken_db_type"]]){
    return("s__")
  } else {
    return("s__s__")
  }
}

genus <- genus_delim()
spp <- spp_delim()

data <- read_tsv(snakemake@input[["combined_report"]])

##### Get Kingdom #####
kingdom_data <- data %>%
  filter(!grepl("p__|g__|s__|c__|o__|f__", `#Classification`)) %>%
  separate(`#Classification`, c("junk", "kingdom"), sep = "^k__") %>%
  select(!junk)

kingdom_data_long <- kingdom_data %>%
  pivot_longer(cols = !kingdom, names_to = "study", values_to = "read_number")

##### Get Genus #####
genus_data <- data %>%
  filter(grepl("g__", `#Classification`)) %>%
  separate(`#Classification`, c("junk", "genus"), sep = genus) %>%
  select(!junk) %>%
  filter(!grepl("s__", genus))

genus_data_long <- genus_data %>%
  pivot_longer(cols = !genus, names_to = "study", values_to = "read_number")

##### Get Species #####
species_data <- data %>%
  filter(grepl("s__", `#Classification`)) %>%
  separate(`#Classification`, c("junk", "species"), sep = spp) %>%
  select(!junk)

species_data_long <- species_data %>%
  pivot_longer(cols = !species, names_to = "study", values_to = "read_number")

##### How many reads per study? #####
total_reads_per_study <- species_data_long %>%
  group_by(study) %>%
  summarise(total_reads = sum(read_number))

classified_reads_plot <- total_reads_per_study %>%
  ggplot() +
  aes(x = study, y = total_reads) +
  geom_col() +
  theme_classic() +
  coord_flip()

##### Species Classification Passed a specific threshold (unfinished) #####
species_data_filtered <- species_data_long %>%
  filter(read_number > 1000)

genus_data_filtered <- genus_data_long %>%
  filter(read_number > 1000)

#### Export Dataframes to TSV files ####
write.table(kingdom_data_long, file = snakemake@output[["kingdom_table"]], sep = "\t", row.names = FALSE)
write.table(genus_data_long, file = snakemake@output[["genus_table"]], sep = "\t", row.names = FALSE)
write.table(species_data_long, file = snakemake@output[["spp_table"]], sep = "\t", row.names = FALSE)
write.table(total_reads_per_study, file = snakemake@output[["classified_reads"]], sep = "\t", row.names = FALSE)

#### Export Classified Reads Plot ####
pdf(snakemake@output[["classified_reads_plot"]])
print(classified_reads_plot)
dev.off()
library(tidyverse)

# Get colour palette
cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

# Set read filter threshold (named read_filter so it does not mask dplyr::filter)
read_filter <- as.numeric(as.character(snakemake@params[["bracken_threshold"]]))

#### Load Data ####
conc_data <- read_tsv(snakemake@input[["conc_input"]],
                      col_names = c("species", "taxa_id", "taxa_lvl", "kraken_assigned_reads",
                                    "added_reads", "new_est_reads", "fraction_total", "file_path")) %>%
  filter(species != "name")

conc_data_clean <- conc_data %>%
  separate(file_path, c("file", "sample"), sep = "out/") %>%
  separate(sample, c("sample", "junk"), sep = "_bracken") %>%
  select(sample, species, taxa_id, taxa_lvl, kraken_assigned_reads, added_reads, new_est_reads, fraction_total) %>%
  mutate_at(c("taxa_id", "kraken_assigned_reads", "added_reads", "new_est_reads", "fraction_total"), as.numeric)

#### Added Reads Across Samples ####
sample_summary <- conc_data_clean %>%
  group_by(sample) %>%
  summarise(kraken_reads = sum(kraken_assigned_reads),
            added_reads = sum(added_reads),
            total_new_reads = sum(new_est_reads))

sample_summary_long <- sample_summary %>%
  pivot_longer(cols = !sample, names_to = "classification_type", values_to = "num_reads")

sample_summary_plot <- sample_summary %>%
  pivot_longer(cols = !sample, names_to = "classification_type", values_to = "num_reads") %>%
  filter(classification_type != "total_new_reads") %>%
  ggplot() +
  aes(x = reorder(sample, desc(num_reads)), y = num_reads, fill = classification_type) +
  geom_bar(stat = "identity", position = "stack") +
  coord_flip() +
  scale_fill_manual(values = cbPalette) +
  theme_classic(base_size = 25) +
  xlab("Sample") +
  ylab("Number of Reads")

ggsave(snakemake@output[["added_reads_plot"]], sample_summary_plot, height = 15, width = 20)

#### Stratified Heatmap of Relative Abundance Per Sample ####
conc_data_clean$species <- sub(" ", "_", conc_data_clean$species)

conc_data_clean_sep <- conc_data_clean %>%
  separate(species, into = c("Genus", "Species"), sep = "_") %>%
  filter(kraken_assigned_reads > read_filter)

conc_data_clean_genus_group <- conc_data_clean_sep %>%
  group_by(Genus) %>%
  summarise(n())

plot_stratified_heatmap <- function(){
  for(i in conc_data_clean_genus_group$Genus) {
    print(
      conc_data_clean_sep %>%
        filter(Genus == i) %>%
        ggplot() +
        aes(x = sample, y = Species) +
        geom_tile(aes(fill = new_est_reads), linejoin = "round") +
        scale_fill_gradientn(colours = c("white", "#F0E442", "#E69F00")) +
        facet_grid(rows = "Genus") +
        theme_classic(base_size = 25) +
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
        xlab("Sample")
    )
  }
}

pdf(snakemake@output[["bracken_stratified_heatmap"]], height = 20, width = 20)
plot_stratified_heatmap()
dev.off()

##### Heatmap of Species #####
plot_heatmap_per_sample <- function(data, filter_thresh){
  data %>%
    filter(new_est_reads > filter_thresh) %>%
    select(sample, species, fraction_total) %>%
    ggplot() +
    aes(x = sample, y = species) +
    geom_tile(aes(fill = log(fraction_total))) +
    scale_fill_gradientn(colours = c("white", "orange", "blue")) +
    theme_classic() +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 0.5)) +
    ggtitle(paste("Log Relative Abundance of Microbial Species: Filtered > ", filter_thresh, sep = ""))
}

pdf(snakemake@output[["bracken_overall_heatmap"]], height = 20, width = 20)
plot_heatmap_per_sample(conc_data_clean, read_filter)
dev.off()
library(tidyverse)
library(MetBrewer)

# Load Appropriate Columns
columns <- c("bin", "marker_lineage", "num_genomes", "num_markers", "num_marker_sets", "completeness",
             "contamination", "strain_heterogen", "genome_size_bp", "num_ambiguous_bases", "num_scaffolds",
             "num_contigs", "N50_scaffold", "N50_contig", "mean_scaffold_length", "mean_contig_length",
             "longest_scaffold", "longest_contig", "GC", "GC_std", "coding_density", "translation_table",
             "num_predicted_genes", "0", "1", "2", "3", "4", "5+", "sample")

# Read data, skip the first row and remove the redundant header rows carried over from the previous step
bin_stats <- read_tsv(snakemake@input[["checkm_conc_result"]], col_names = columns, skip = 1) %>%
  filter(bin != "Bin Id")

# Convert to numeric columns
bin_stats$completeness <- as.numeric(bin_stats$completeness)
bin_stats$contamination <- as.numeric(bin_stats$contamination)
bin_stats$genome_size_bp <- as.numeric(bin_stats$genome_size_bp)
bin_stats$num_contigs <- as.numeric(bin_stats$num_contigs)
bin_stats$mean_contig_length <- as.numeric(bin_stats$mean_contig_length)
bin_stats$N50_contig <- as.numeric(bin_stats$N50_contig)

# Get breaks for plotting
bin_stats$col <- cut(bin_stats$completeness, breaks = c(-Inf, 80, Inf), labels = c("<80", ">=80"))
bin_stats$contig_cut <- cut(bin_stats$num_contigs, breaks = c(-Inf, 250, Inf), labels = c("<=250", ">250"))

# Plot Completeness vs Contamination
CompletenessVsContam <- bin_stats %>%
  ggplot() +
  aes(x = completeness, y = contamination, size = genome_size_bp, col = col) +
  geom_point(alpha = 0.5) +
  scale_x_continuous(limits = c(0, 100)) +
  theme_minimal() +
  scale_color_manual(values = c("blue", "orange"))

pdf(snakemake@output[["CompletenessVsContam"]])
print(CompletenessVsContam)
dev.off()

#### Plot Completeness vs Contamination Per Sample
bin_stats_sample_split <- bin_stats %>%
  separate(bin, into = c("sample", "bin_num"), sep = "\\.")

compvscontam_samp <- bin_stats_sample_split %>%
  ggplot() +
  aes(x = completeness, y = contamination, col = sample) +
  geom_point(alpha = 0.8, size = 4) +
  scale_x_continuous(limits = c(0, 100)) +
  geom_vline(xintercept = 80, linetype = "dotted", color = "green", size = 1) +
  geom_vline(xintercept = 50, linetype = "dotted", color = "orange", size = 1) +
  theme_bw(base_size = 18) +
  scale_color_manual(values = met.brewer("Redon", 10, type = "discrete")) +
  xlab("Completeness") +
  ylab("Contamination") +
  labs(colour = "Sample") +
  theme(legend.position = "bottom", legend.direction = "horizontal", legend.text = element_text(size = 10))

pdf(snakemake@output[["CompletenessVsContam_Per_Sample"]])
print(compvscontam_samp)
dev.off()

#### Plot Bar Chart with Completeness vs Contamination Per Sample
bin_stats_bar <- bin_stats_sample_split %>%
  unite("sample_bin", sample:bin_num) %>%
  select(completeness, contamination, sample_bin) %>%
  pivot_longer(c(completeness, contamination), names_to = "bin_rank", values_to = "value") %>%
  ggplot() +
  aes(x = reorder(sample_bin, desc(value)), y = value, fill = bin_rank) +
  geom_bar(stat = "identity", position = "dodge") +
  theme_bw(base_size = 15) +
  coord_flip() +
  scale_fill_manual(values = met.brewer("Redon", 2, type = "discrete"),
                    labels = c("Completeness", "Contamination")) +
  geom_hline(yintercept = 80, linetype = "dotted", color = "green", size = 1) +
  geom_hline(yintercept = 50, linetype = "dotted", color = "orange", size = 1) +
  xlab("Sample") +
  ylab("Contamination and Completion Score (%)") +
  labs(fill = "CheckM Completion vs Contamination Score") +
  theme(legend.position = "bottom", legend.direction = "vertical")

pdf(snakemake@output[["BarChartCompletenessContamination"]])
print(bin_stats_bar)
dev.off()

#### Plot the number of contigs vs completeness
NumContigsVsCompleteness <- bin_stats %>%
  ggplot() +
  aes(x = num_contigs, y = completeness, size = mean_contig_length, col = contig_cut) +
  geom_point(alpha = 0.5) +
  theme_minimal() +
  scale_color_manual(values = c("blue", "orange"))

pdf(snakemake@output[["NumContigsVsCompleteness"]])
print(NumContigsVsCompleteness)
dev.off()
library(tidyverse)
library(MetBrewer)

#### Import Kraken Data
concatenated_kraken_summaries <- read_tsv(snakemake@input[["concatenated_kraken_summary"]],
                                          col_names = c("proportion_of_reads", "number_of_reads", "number_of_reads2",
                                                        "read_code", "read_code2", "classification_status", "file_path"))

concatenated_kraken_summaries_clean <- concatenated_kraken_summaries %>%
  separate(file_path, into = c("junk", "filename"), sep = "classified_summary/") %>%
  separate(filename, into = c("sample", "junk2"), sep = "_classified_summary.txt") %>%
  select(sample, proportion_of_reads, number_of_reads, classification_status)

#### Plot Proportion of Classified vs Unclassified Reads from Kraken
plot_kraken_proportion <- function(kraken_table){
  concatenated_kraken_summaries_clean_plot <- kraken_table %>%
    ggplot() +
    aes(x = sample, y = number_of_reads, fill = classification_status) +
    geom_bar(stat = "identity", position = "fill") +
    theme_bw(base_size = 15) +
    scale_fill_manual(values = met.brewer("Redon", n = 2, type = "continuous"),
                      labels = c("Classified Reads", "Unclassified Reads")) +
    xlab("Sample") +
    ylab("Proportion of Reads") +
    theme(legend.title = element_blank(),
          axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1),
          legend.position = "top")
  return(concatenated_kraken_summaries_clean_plot)
}

kraken_classification_proportions <- plot_kraken_proportion(concatenated_kraken_summaries_clean)

#### Save file
ggsave(snakemake@output[["classified_proportions"]], kraken_classification_proportions)
library(tidyverse)
library(MetBrewer)

##### Import Data #####
spp_tbl <- read_tsv(snakemake@input[["spp_table"]])
genus_tbl <- read_tsv(snakemake@input[["genus_table"]])

##### Define User Read Thresholds #####
filter_threshold <- as.numeric(as.character(snakemake@params[["strat_thresh"]]))
genus_thres <- as.numeric(as.character(snakemake@params[["genus_thresh"]]))
spp_thres <- as.numeric(as.character(snakemake@params[["species_thresh"]]))

##### Clean and Prep Data #####
genus_tbl_clean <- genus_tbl %>%
  separate(study, into = c("sample", "junk"), sep = "_") %>%
  select(!junk) %>%
  rename("phylo_level" = genus)

spp_tbl_clean <- spp_tbl %>%
  separate(study, into = c("sample", "junk"), sep = "_") %>%
  select(!junk) %>%
  rename("phylo_level" = species)

##### Plot Function #####
kraken_heatmap <- function(clean_kraken_table, read_thresh, type){
  heatmap <- clean_kraken_table %>%
    filter(read_number > read_thresh) %>%
    filter(phylo_level != "Homo_sapiens") %>%
    ggplot() +
    aes(x = sample, y = phylo_level) +
    geom_tile(aes(fill = log(read_number)), linejoin = "round") +
    scale_fill_gradientn(colours = met.brewer("Hiroshige", 4, type = "discrete")) +
    theme_bw(base_size = 20) +
    theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1),
          legend.position = "top",
          legend.text = element_text(size = 15),
          legend.title = element_text(size = 15)) +
    labs(fill = "Read Number (log)") +
    xlab("Sample") +
    ylab(type)
  return(heatmap)
}

species_heatmap <- kraken_heatmap(spp_tbl_clean, spp_thres, "Species")
genus_heatmap <- kraken_heatmap(genus_tbl_clean, genus_thres, "Genus")

ggsave(snakemake@output[["genus_heatmap"]], genus_heatmap, height = 20, width = 15)
ggsave(snakemake@output[["species_heatmap"]], species_heatmap, height = 20, width = 15)

##### Faceted Plot By Genus #####
spp_tbl_split <- spp_tbl_clean %>%
  separate(phylo_level, into = c("Genus", "Species1", "Species2"), sep = "_") %>%
  unite("Species", Species1:Species2, na.rm = TRUE)

genus_list <- spp_tbl_split %>%
  group_by(Genus) %>%
  summarise(n())

plot_species_facets <- function() {
  for(i in genus_list$Genus) {
    print(
      spp_tbl_split %>%
        filter(Genus == i) %>%
        ggplot() +
        aes(x = sample, y = Species) +
        geom_tile(aes(fill = read_number), linejoin = "round") +
        scale_fill_gradientn(colours = c("white", "#F0E442", "#E69F00")) +
        facet_grid(rows = "Genus") +
        theme_classic(base_size = 25) +
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
    )
  }
}

pdf(snakemake@output[["stratified_heatmap"]], height = 20, width = 20)
plot_species_facets()
dev.off()
tool / cran
ggplot2
Create Elegant Data Visualisations Using the Grammar of Graphics: A system for 'declaratively' creating graphics, based on "The Grammar of Graphics". You provide the data, tell 'ggplot2' how to map variables to aesthetics, what graphical primitives to use, and it takes care of the details.
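A minimal sketch of that workflow, independent of the pipelines above: pass the data, map variables to aesthetics, add a geometric layer, and ggplot2 handles scales, legends, and rendering. It uses R's built-in mtcars dataset; the output filename is illustrative only.

library(ggplot2)

# Data + aesthetic mappings + a geom; ggplot2 takes care of the rest
p <- ggplot(mtcars, aes(x = wt, y = mpg, colour = factor(cyl))) +
  geom_point(size = 3, alpha = 0.8) +
  labs(x = "Weight (1000 lbs)", y = "Miles per gallon", colour = "Cylinders") +
  theme_minimal()

ggsave("mtcars_scatter.pdf", p)  # illustrative output path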