Workflow Steps and Code Snippets

1022 tagged steps and code snippets that match keyword ggplot2

WES analysis pipeline

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
Sys.setenv("TZDIR"=paste0(Sys.getenv("CONDA_PREFIX"), "/share/zoneinfo"))

library(readr)
library(dplyr)
library(ggplot2)

df <- read_csv(snakemake@input[[1]])
df <- df %>%
    mutate(sample_type = factor(sample_type, levels = c("normal", "tumor")))
p <- df %>%
    ggplot(aes(sample_type, mean)) +
    geom_boxplot(aes(fill = sample_type)) +
    labs(x = "", y = "Average Depth") +
    guides(fill = guide_legend(title="")) +
    ggtitle("Sequencing Depths")
ggsave(snakemake@output[[1]], p)
56
57
58
59
60
61
62
63
64
65
#make plot comparing estimated to true degree - excluding unrelated as it distorts scale
print(
    ggplot(estimatedVsTruth, aes(Degree, EstDegree)) 
        + geom_point(aes(size = n)) 
        + geom_text(aes(label=n),hjust=0.5, vjust=-1, colour='red') 
        + theme_bw()
        + labs(title="Predicted vs True with UR",
                        y="Estimated degree of relationship",
                        x="Degree of reported relationship")
)
70
71
72
73
74
75
76
77
78
79
#make plot comparing estimated to true degree - excluding unrelated as it distorts scale
print(
    ggplot(estimatedVsTruth %>% filter(Degree != "UR") , aes(Degree, EstDegree)) 
        + geom_point(aes(size = n)) 
        + geom_text(aes(label=n),hjust=0.5, vjust=-1, colour='red') 
        + theme_bw()
        + labs(title="Predicted vs True no UR",
                        y="Estimated degree of relationship",
                        x="Degree of reported relationship")
)

Pipeline to pull microbial reads from WGS data and perform metagenomic analysis (v1.1)

 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
library(tidyverse)
library(MetBrewer)

#### Load Data ####
concatenated_alignments <- read_tsv(snakemake@input[["alignment_stats"]], col_names = c("SN", "sequence", "num_sequences", "file_samples"))
concatenated_alignments$file_samples[concatenated_alignments$file_samples == "# excluding supplementary and secondary reads"] <- ""

#### Get List of Sample Names ####
sample_names <- concatenated_alignments %>%
  separate(file_samples, c("file", "sample"), sep = "stats_ffq/") %>%
  separate(sample, c("sample", "junk"), sep = "_align") %>%
  select(sequence, num_sequences, sample) %>%
  group_by(sample) %>%
  summarise(n()) %>%
  select(sample) %>%
  filter(sample != "NA")

sample_names <- as.vector(sample_names$sample)

#### Clean Data ####
concatenated_alignments_clean <- concatenated_alignments %>%
  select(!SN) %>%
  separate(file_samples, c("file", "sample"), sep = "stats_ffq/") %>%
  separate(sample, c("sample", "junk"), sep = "_align") %>%
  select(sequence, num_sequences, sample) %>%
  mutate(sample = replace(sample, is.na(sample), sample_names))

##### Generate Summary Tables ######
# Wider, 'human readable' format with proportions
concatenated_alignments_clean_wide <- concatenated_alignments_clean %>% 
  group_by(sample) %>%
  pivot_wider(names_from = sequence, values_from = num_sequences) %>%
  mutate(proportion_mapped = `reads mapped:` / `raw total sequences:` * 100) 

write.table(concatenated_alignments_clean_wide, file = snakemake@output[["human_read_tbl"]], sep = "\t", row.names = FALSE)
write.table(concatenated_alignments_clean, file = snakemake@output[["long_read_tbl"]], sep = "\t", row.names = FALSE)

#### Plot Summaries ####
alignment_statistics_plot <- concatenated_alignments_clean %>%
  ggplot() + 
  aes(x = sample, y = num_sequences, fill = sequence) %>%
  geom_bar(stat = "identity", position = "dodge") + 
  theme_bw(base_size = 15) + 
  scale_fill_manual(values = met.brewer("Redon", n = 3, type = "discrete"), 
                    labels = c("Raw Read Number", "Reads Mapped", "Reads Unmapped")) + 
  xlab("Sample") + 
  ylab("Read Number") + 
  theme(legend.title = element_blank(), 
        axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1), 
        legend.position = "top") 

ggsave(snakemake@output[["alignment_stat_plot"]], alignment_statistics_plot)
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
library(tidyverse)

genus_delim <- function(){
  if(snakemake@params[["kraken_db_type"]]){
    return("g__")
  }else{
    return("g__g__")
  }
}


spp_delim <- function(){
  if(snakemake@params[["kraken_db_type"]]){
    return("s__")
  }else{
    return("s__s__")
  }
}

genus <- genus_delim()
spp <- spp_delim()

data <- read_tsv(snakemake@input[["combined_report"]])

##### Get Domain #####
kingdom_data <- data %>%
  filter(!grepl("p__|g__|s__|c__|o__|f__", `#Classification`)) %>%
  separate(`#Classification`, c("junk", "kingdom"), sep = "^k__") %>%
  select(!junk)

kingdom_data_long <- kingdom_data %>%
  pivot_longer(cols = !kingdom, names_to = "study", values_to = "read_number") 

##### Get genus  #####
genus_data <- data %>%
  filter(grepl("g__", `#Classification`)) %>%
  separate(`#Classification`, c("junk", "genus"), sep = genus) %>%
  select(!junk) %>%
  filter(!grepl("s__", genus)) 

genus_data_long <- genus_data %>%
  pivot_longer(cols = !genus, names_to = "study", values_to = "read_number") 

##### Get species  #####
species_data <- data %>% 
  filter(grepl("s__", `#Classification`)) %>%
  separate(`#Classification`, c("junk", "species"), sep = spp) %>%
  select(!junk)

species_data_long <- species_data %>% 
  pivot_longer(cols = !species, names_to = "study", values_to = "read_number") 

##### How many reads per study? #####
total_reads_per_study <- species_data_long %>% 
  group_by(study) %>%
  summarise(total_reads = sum(read_number)) 

classified_reads_plot <- total_reads_per_study %>% 
  ggplot() + 
  aes(x = study, y = total_reads) + 
  geom_col() + 
  theme_classic() + 
  coord_flip()

##### Species Classification Passed a specific threshold (unfinished) ##### 
species_data_filtered <- species_data_long %>%
  filter(read_number > 1000)

genus_data_filtered <- genus_data_long %>%
  filter(read_number > 1000)

#### Export Dataframes to TSV files ####
write.table(kingdom_data_long, file = snakemake@output[["kingdom_table"]], sep = "\t", row.names = FALSE)
write.table(genus_data_long, file = snakemake@output[["genus_table"]], sep = "\t", row.names = FALSE)
write.table(species_data_long, file = snakemake@output[["spp_table"]], sep = "\t", row.names = FALSE)
write.table(total_reads_per_study, file = snakemake@output[["classified_reads"]], sep = "\t", row.names = FALSE)

#### Export Classified Reads Plot ####
pdf(snakemake@output[["classified_reads_plot"]])
print(classified_reads_plot)
dev.off()
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
library(tidyverse)
# Get colour pallette
cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
# Set Read Filter
filter <- as.character(snakemake@params[["bracken_threshold"]])
filter <- as.numeric(filter)

#### Load Data #####
conc_data <- read_tsv(snakemake@input[["conc_input"]], col_names = c("species",
                                                                       "taxa_id", 
                                                                       "taxa_lvl", 
                                                                       "kraken_assigned_reads", 
                                                                       "added_reads", 
                                                                       "new_est_reads", 
                                                                       "fraction_total", 
                                                                       "file_path")) %>%
  filter(species != "name")

conc_data_clean <- conc_data %>%
  separate(file_path, c("file", "sample"), sep = "out/") %>%
  separate(sample, c("sample", "junk"), sep = "_bracken") %>%
  select(sample, species, taxa_id, taxa_lvl, kraken_assigned_reads, added_reads, new_est_reads, fraction_total) %>%
  mutate_at(c("taxa_id", "kraken_assigned_reads", "added_reads", "new_est_reads", "fraction_total"), as.numeric)


#### Added Reads Across Samples ####

sample_summary <- conc_data_clean %>%
  group_by(sample) %>%
  summarise(kraken_reads = sum(kraken_assigned_reads), 
            added_reads = sum(added_reads), 
            total_new_reads = sum(new_est_reads))

sample_summary_long <- sample_summary %>%
  pivot_longer(cols = !sample, 
               names_to = "classification_type", 
               values_to = "num_reads") 

sample_summary_plot <- sample_summary %>%
  pivot_longer(cols = !sample, 
               names_to = "classification_type", 
               values_to = "num_reads") %>%
  filter(classification_type != "total_new_reads") %>%
  ggplot() + 
  aes(reorder(x = sample, desc(num_reads)), y = num_reads, fill = classification_type) + 
  geom_bar(stat = "identity", position = "stack") + 
  coord_flip() + 
  scale_fill_manual(values = cbPalette) + 
  theme_classic(base_size = 25) + 
  xlab("Sample") + 
  ylab("Number of Reads")

ggsave(snakemake@output[["added_reads_plot"]], height = 15, width = 20)

#### Stratified Heatmap of Relative Abundance Per Per Sample ####

conc_data_clean$species <- sub(" ", "_", conc_data_clean$species)

conc_data_clean_sep <- conc_data_clean %>%
  separate(species, into = c("Genus", "Species"), sep = "_") %>%
  filter(kraken_assigned_reads > filter)

conc_data_clean_genus_group <- conc_data_clean_sep %>%
  group_by(Genus) %>%
  summarise(n())

plot_stratified_heatmap <- function(){
  for(i in conc_data_clean_genus_group$Genus) {
  print(
    conc_data_clean_sep %>%
      filter(Genus == i) %>% 
      ggplot() + 
      aes(x = sample, y = Species) + 
      geom_tile(aes(fill = new_est_reads), linejoin = "round") + 
      scale_fill_gradientn(colours = c("white", "#F0E442", "#E69F00")) +
      facet_grid(rows = "Genus") + 
      theme_classic(base_size = 25) + 
      theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) + 
      xlab("Sample"))
  }
}

pdf(snakemake@output[["bracken_stratified_heatmap"]], height = 20, width = 20)
plot_stratified_heatmap() 
dev.off()

##### Heatmap of Species #####
plot_heatmap_per_sample <- function(data, filter_thresh){
  data %>%
    filter(new_est_reads > filter_thresh) %>%
    select(sample, species, fraction_total) %>%
    ggplot() +
    aes(x = sample, y = species) +
    geom_tile(aes(fill = log(fraction_total))) + 
    scale_fill_gradientn(colours = c("white", "orange", "blue")) + 
    theme_classic() + 
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 0.5)) + 
    ggtitle(paste("Log Relative Abundance of Microbial Species filter: Filtered > ", filter_thresh, sep = ""))
}

pdf(snakemake@output[["bracken_overall_heatmap"]], height = 20, width = 20)
plot_heatmap_per_sample(conc_data_clean, filter)
dev.off()
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
library(tidyverse)
library(MetBrewer)

# Load Appropriate Columns 
columns <- c("bin", "marker_lineage", "num_genomes", "num_markers", "num_marker_sets", "completeness", "contamination",
             "strain_heterogen", "genome_size_bp", "num_ambiguous_bases", "num_scaffolds", "num_contigs", "N50_scaffold", "N50_contig",
             "mean_scaffold_length", "mean_contig_length", "longest_scaffold", "longest_contig", 
             "GC", "GC_std", "coding_density", "translation_table", "num_predicted_genes", "0", "1", "2", "3", "4", "5+", "sample")

# Read Data, skip first row and remove the redundant header columns from previous step
bin_stats <- read_tsv(snakemake@input[["checkm_conc_result"]], col_names = columns, skip = 1) %>%
  filter(bin != "Bin Id")

# Conver to Numeric Columns
bin_stats$completeness <- as.numeric(bin_stats$completeness)
bin_stats$contamination <- as.numeric(bin_stats$contamination)
bin_stats$genome_size_bp <- as.numeric(bin_stats$genome_size_bp)
bin_stats$num_contigs <- as.numeric(bin_stats$num_contigs)
bin_stats$mean_contig_length <- as.numeric(bin_stats$mean_contig_length)
bin_stats$N50_contig <- as.numeric(bin_stats$N50_contig)

# Get breaks for plotting 
bin_stats$col <- cut(bin_stats$completeness, 
                        breaks = c(-Inf, 80, Inf), 
                        labels = c("<80", ">=80"))

bin_stats$contig_cut <- cut(bin_stats$num_contigs, 
                     breaks = c(-Inf, 250, Inf), 
                     labels = c("<=250", ">250"))

# Plot Completeness Vs Contamination Per Sample
CompletenessVsContam <- bin_stats %>%
  ggplot() + 
  aes(x = completeness, y = contamination, size = genome_size_bp, col = col) %>%
  geom_point(alpha = 0.5) + 
  scale_x_continuous(limits = c(0, 100)) +
  theme_minimal() + 
  scale_color_manual(values = c("blue", "orange"))

pdf(snakemake@output[["CompletenessVsContam"]])
print(CompletenessVsContam)
dev.off()

#### Plot Completeness Vs Contamination Per Sample
bin_stats_sample_split <- bin_stats %>%
  separate(bin, into = c("sample", "bin_num"), sep = "\\.")

compvscontam_samp <- bin_stats_sample_split %>%
  ggplot() + 
  aes(x = completeness, y = contamination, col = sample) %>%
  geom_point(alpha = 0.8, size = 4) + 
  scale_x_continuous(limits = c(0, 100 )) +
  geom_vline( xintercept = 80, linetype = "dotted", 
              color = "green", size = 1) + 
  geom_vline(xintercept = 50, linetype = "dotted", 
             color = "orange", size = 1) + 
  theme_bw(base_size = 18) + 
  scale_color_manual(values = met.brewer("Redon", 10, type = "discrete")) + 
  xlab("Completeness") + 
  ylab("Contamination") + 
  labs(colour = "Sample") + 
  theme(legend.position = "bottom", 
        legend.direction = "horizontal", 
        legend.text = element_text(size = 10))

pdf(snakemake@output[["CompletenessVsContam_Per_Sample"]])
print(compvscontam_samp)
dev.off()

#### Plot Bar Chart with Completeness vs Contamination Per Sample
bin_stats_bar <- bin_stats_sample_split %>%
  unite("sample_bin", sample:bin_num) %>%
  select(completeness, contamination, sample_bin) %>%
  pivot_longer(c(completeness, contamination), 
              names_to = "bin_rank", 
              values_to = "value") %>%
  ggplot() + 
  aes(reorder(x = sample_bin, desc(value)), y = value, fill = bin_rank) + 
  geom_bar(stat = "identity", position = "dodge") + 
  theme_bw(base_size = 15) + 
  coord_flip() + 
  scale_fill_manual(values = met.brewer("Redon", 2, type = "discrete"), labels = c("Completeness", "Contamination")) + 
  geom_hline(yintercept = 80, linetype = "dotted", 
              color = "green", size = 1) + 
  geom_hline(yintercept = 50, linetype = "dotted", 
             color = "orange", size = 1) + 
  xlab("Sample") + 
  ylab("Contamination and Completion Score (%)") + 
  labs(fill = "CheckM Completion vs Contamination Score") + 
  theme(legend.position = "bottom", 
        legend.direction = "vertical")

pdf(snakemake@output[["BarChartCompletenessContamination"]])
print(bin_stats_bar)
dev.off()

#### Plot the number of contigs vs completeness
NumContigsVsCompleteness <- bin_stats %>%
  ggplot() + 
  aes(x = num_contigs, y = completeness, size = mean_contig_length, col = contig_cut) + 
  geom_point(alpha = 0.5) + 
  theme_minimal() + 
  scale_color_manual(values = c("blue", "orange"))

pdf(snakemake@output[["NumContigsVsCompleteness"]])
print(NumContigsVsCompleteness)
dev.off()
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
library(tidyverse)
library(MetBrewer)

#### Import Kraken Data
concatenated_kraken_summaries <- read_tsv(snakemake@input[["concatenated_kraken_summary"]], 
                                            col_names = c("proportion_of_reads", 
                                                        "number_of_reads", 
                                                        "number_of_reads2",
                                                        "read_code", 
                                                        "read_code2", 
                                                        "classification_status", 
                                                        "file_path"))

concatenated_kraken_summaries_clean <- concatenated_kraken_summaries %>%
  separate(file_path, into = c("junk", "filename"), sep = "classified_summary/") %>%
  separate(filename, into = c("sample", "junk2"), sep = "_classified_summary.txt") %>%
  select(sample, proportion_of_reads, number_of_reads, classification_status) 


#### Plot Proportion of Classified vs Unclassified Reads from Kraken
plot_kraken_proportion <- function(kraken_table){

  concatenated_kraken_summaries_clean_plot <- kraken_table %>%
    ggplot() + 
    aes(x = sample, y = number_of_reads, fill = classification_status) + 
    geom_bar(stat = "identity", position = "fill") + 
    theme_bw(base_size = 15) + 
    scale_fill_manual(values = met.brewer("Redon", n = 2, type = "continuous"), 
                      labels = c("Classified Reads", "Unclassified Reads")) + 
    xlab("Sample") + 
    ylab("Proportion of Reads") + 
    theme(legend.title = element_blank(), 
          axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1), 
          legend.position = "top")

  return(concatenated_kraken_summaries_clean_plot)

}

kraken_classification_proportions <- plot_kraken_proportion(concatenated_kraken_summaries_clean)

#### Save file
ggsave(snakemake@output[["classified_proportions"]], kraken_classification_proportions)
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
library(tidyverse)
library(MetBrewer)

##### Import Data #####

spp_tbl <- read_tsv(snakemake@input[["spp_table"]])
genus_tbl <- read_tsv(snakemake@input[["genus_table"]]) 

##### Define User Read Threshold #####
filter_threshold <- as.character(snakemake@params[["strat_thresh"]])
filter_threshold <- as.numeric(filter_threshold)

genus_thres <- as.character(snakemake@params[["genus_thresh"]])
genus_thres <- as.numeric(genus_thres)

spp_thres <- as.character(snakemake@params[["species_thresh"]])
spp_thres <- as.numeric(spp_thres)

##### Clean and Prep Data #####
genus_tbl_clean <- genus_tbl %>%
  separate(study, into = c("sample", "junk"), sep = "_") %>%
  select(!junk) %>%
  rename("phylo_level" = genus)

spp_tbl_clean <- spp_tbl %>%
  separate(study, into = c("sample", "junk"), sep = "_") %>%
  select(!junk) %>%
  rename("phylo_level" = species)

##### Plot Function #####

kraken_heatmap <- function(clean_kraken_table, read_thresh, type){
  heatmap <- clean_kraken_table %>%
    filter(read_number > read_thresh) %>%
    filter(phylo_level != "Homo_sapiens") %>%
    ggplot() + 
    aes(x = sample, y = phylo_level) + 
    geom_tile(aes(fill = log(read_number)), linejoin = "round") + 
    scale_fill_gradientn(colours = met.brewer("Hiroshige", 4, type = "discrete")) +
    theme_bw(base_size = 20) +
    theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1), 
          legend.position = "top", 
          legend.text = element_text(size = 15), 
          legend.title = element_text(size = 15)) + 
    labs(fill = "Read Number (log)") + 
    xlab("Sample") + 
    ylab(type)

  return(heatmap)

}

species_heatmap <- kraken_heatmap(spp_tbl_clean, spp_thres, "Species")
genus_heatmap <- kraken_heatmap(genus_tbl_clean, genus_thres, "Genus")

ggsave(snakemake@output[["genus_heatmap"]], genus_heatmap, height = 20, width = 15)
ggsave(snakemake@output[["species_heatmap"]], species_heatmap, height = 20, width = 15)

##### Faceted Plot By Genus #####
spp_tbl_split <- spp_tbl_clean %>% 
  separate(phylo_level, into = c("Genus", "Species1", "Species2"), sep = "_") %>%
  unite("Species", Species1:Species2, na.rm = TRUE) 

genus_list <- spp_tbl_split %>%
  group_by(Genus) %>%
  summarise(n()) 

plot_species_facets <- function() {
  for(i in genus_list$Genus) {
    print(
      spp_tbl_split %>%
        filter(Genus == i) %>% 
        ggplot() + 
        aes(x = sample, y = Species) + 
        geom_tile(aes(fill = read_number), linejoin = "round") + 
        scale_fill_gradientn(colours = c("white", "#F0E442", "#E69F00")) +
        facet_grid(rows = "Genus") + 
        theme_classic(base_size = 25) + 
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
        )
    }
}

pdf(snakemake@output[["stratified_heatmap"]], height = 20, width = 20)
plot_species_facets()
dev.off()
tool / cran

ggplot2

Create Elegant Data Visualisations Using the Grammar of Graphics: A system for 'declaratively' creating graphics, based on "The Grammar of Graphics". You provide the data, tell 'ggplot2' how to map variables to aesthetics, what graphical primitives to use, and it takes care of the details.