Workflow Steps and Code Snippets

729 tagged steps and code snippets that match the keyword 'tidyverse'

Bioinfo Macro Host Genome Short Variant Discovery Workflow

library(getopt)
library(tidyverse)

option_matrix <- matrix(
  c(
    "infile", "i", 1, "character",
    "outfile", "o", 1, "character"
  ),
  byrow = TRUE, ncol = 4
)

options <- getopt(option_matrix)



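# Parse a gtcheck-style discordance table (presumably `bcftools gtcheck` output),
# skipping the header lines and dropping the leading record-type column.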
read_gtcheck <- function(gtcheck_filename) {
  read_tsv(
    file = gtcheck_filename,
    skip = 20,
    col_names = c(
      "dc", "query_sample", "genotyped_sample", "discordance", "log_p_hwe",
      "number_of_sites"
    )
  ) %>%
  select(-dc)
}

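# Build a symmetric discordance matrix from the pairwise table: a zero diagonal
# plus the upper and lower triangles, pivoted wide so heatmap() can consume it.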
create_distance_matrix <- function(gtcheck_long) {
  gtcheck_diagonal <-
    gtcheck_long %>%
    select(query_sample, genotyped_sample) %>%
    mutate(discordance = 0) %>%
    pivot_longer(
      query_sample:genotyped_sample,
      names_to = "type",
      values_to = "query_sample"
    ) %>%
    select(query_sample) %>%
    mutate(
      genotyped_sample = query_sample,
      discordance = 0.0
    ) %>%
    distinct()

  gtcheck_upper <-
    gtcheck_long %>%
    select(query_sample, genotyped_sample, discordance)

  gtcheck_lower <-
    gtcheck_long %>%
    select(
      genotyped_sample = query_sample,
      query_sample = genotyped_sample,
      discordance
    )

  gtcheck_discordances <-
    bind_rows(gtcheck_diagonal, gtcheck_upper, gtcheck_lower) %>%
    arrange(query_sample)

  gtcheck_discordances_matrix <-
    gtcheck_discordances %>%
    pivot_wider(
      names_from = genotyped_sample,
      values_from = discordance
    ) %>%
    column_to_rownames("query_sample") %>%
    as.matrix()

  return(gtcheck_discordances_matrix)
}


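# When run non-interactively (e.g. as a workflow step), read the input,
# build the discordance matrix and write the heatmap to an A4 PDF.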
if (!interactive()) {
  gtcheck_long <- read_gtcheck(options$infile)
  gtcheck_discordances_matrix <- create_distance_matrix(gtcheck_long)
  pdf(file = options$outfile, paper = "a4")
  heatmap(gtcheck_discordances_matrix)
  dev.off()
}

Repository containing bioinformatic code for macro-scale host transcriptomic data processing (v0.0.1)

library(tidyverse)
library(argparse)

read_star_counts <- function(filename) {
  # Read only the first two columns; the other two are for strand-specific data.
  # Extract the sample id from the file name, since the Illumina file name has
  # to be matched with the sample name.

  star_column_names <- c(
    "gene_id", "unstranded", "stranded_forward", "stranded_reverse"
  )

  filename %>%
    read_tsv(
      col_names = star_column_names,
      col_select = 1:2,
      skip = 4,
      show_col_types = FALSE
    ) %>%
    mutate(
      sample_id = filename %>%
        basename() %>%
        str_remove(".ReadsPerGene.out.tab")
    ) %>%
    select(sample_id, gene_id, counts = unstranded)
}

parser <- ArgumentParser()

parser$add_argument(
  "-i", "--input-folder",
  type = "character",
  dest = "input_folder",
  help = paste(
    "Folder that contains the STAR counts. Will search recursively for",
    "files ending in \".ReadsPerGene.out.tab\"."
  )
)

parser$add_argument(
  "-o", "--output-file",
  type = "character",
  dest = "output_file",
  help = paste(
    "Output file with the table containing all the counts together"
  )
)

args <- parser$parse_args()

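# Collect every per-sample STAR ReadsPerGene file under the input folder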
files <-
  list.files(
    path = args$input_folder,
    pattern = "\\.ReadsPerGene\\.out\\.tab$",
    recursive = TRUE,
    full.names = TRUE
  )

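# Read each file, stack the long tables and spread into a gene x sample count matrix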
counts_raw <-
  files %>%
  map(read_star_counts) %>%
  bind_rows() %>%
  pivot_wider(names_from = sample_id, values_from = counts)

dir.create(dirname(args$output_file), showWarnings = FALSE, recursive = TRUE)
write_tsv(counts_raw, args$output_file)

Pipeline to pull microbial reads from WGS data and perform metagenomic analysis (v1.1)

library(tidyverse)
library(MetBrewer)

#### Load Data ####
concatenated_alignments <- read_tsv(snakemake@input[["alignment_stats"]], col_names = c("SN", "sequence", "num_sequences", "file_samples"))
concatenated_alignments$file_samples[concatenated_alignments$file_samples == "# excluding supplementary and secondary reads"] <- ""

#### Get List of Sample Names ####
sample_names <- concatenated_alignments %>%
  separate(file_samples, c("file", "sample"), sep = "stats_ffq/") %>%
  separate(sample, c("sample", "junk"), sep = "_align") %>%
  select(sequence, num_sequences, sample) %>%
  group_by(sample) %>%
  summarise(n()) %>%
  select(sample) %>%
  filter(sample != "NA")

sample_names <- as.vector(sample_names$sample)

#### Clean Data ####
concatenated_alignments_clean <- concatenated_alignments %>%
  select(!SN) %>%
  separate(file_samples, c("file", "sample"), sep = "stats_ffq/") %>%
  separate(sample, c("sample", "junk"), sep = "_align") %>%
  select(sequence, num_sequences, sample) %>%
  mutate(sample = replace(sample, is.na(sample), sample_names))

##### Generate Summary Tables ######
# Wider, 'human readable' format with proportions
concatenated_alignments_clean_wide <- concatenated_alignments_clean %>% 
  group_by(sample) %>%
  pivot_wider(names_from = sequence, values_from = num_sequences) %>%
  mutate(proportion_mapped = `reads mapped:` / `raw total sequences:` * 100) 

write.table(concatenated_alignments_clean_wide, file = snakemake@output[["human_read_tbl"]], sep = "\t", row.names = FALSE)
write.table(concatenated_alignments_clean, file = snakemake@output[["long_read_tbl"]], sep = "\t", row.names = FALSE)

#### Plot Summaries ####
alignment_statistics_plot <- concatenated_alignments_clean %>%
  ggplot() + 
  aes(x = sample, y = num_sequences, fill = sequence) +
  geom_bar(stat = "identity", position = "dodge") + 
  theme_bw(base_size = 15) + 
  scale_fill_manual(values = met.brewer("Redon", n = 3, type = "discrete"), 
                    labels = c("Raw Read Number", "Reads Mapped", "Reads Unmapped")) + 
  xlab("Sample") + 
  ylab("Read Number") + 
  theme(legend.title = element_blank(), 
        axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1), 
        legend.position = "top") 

ggsave(snakemake@output[["alignment_stat_plot"]], alignment_statistics_plot)
library(tidyverse)

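# The taxonomic rank prefixes differ with the Kraken database type flag,
# so pick the genus/species delimiters accordingly.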
genus_delim <- function(){
  if(snakemake@params[["kraken_db_type"]]){
    return("g__")
  }else{
    return("g__g__")
  }
}


spp_delim <- function(){
  if(snakemake@params[["kraken_db_type"]]){
    return("s__")
  }else{
    return("s__s__")
  }
}

genus <- genus_delim()
spp <- spp_delim()

data <- read_tsv(snakemake@input[["combined_report"]])

##### Get Kingdom #####
kingdom_data <- data %>%
  filter(!grepl("p__|g__|s__|c__|o__|f__", `#Classification`)) %>%
  separate(`#Classification`, c("junk", "kingdom"), sep = "^k__") %>%
  select(!junk)

kingdom_data_long <- kingdom_data %>%
  pivot_longer(cols = !kingdom, names_to = "study", values_to = "read_number") 

##### Get genus  #####
genus_data <- data %>%
  filter(grepl("g__", `#Classification`)) %>%
  separate(`#Classification`, c("junk", "genus"), sep = genus) %>%
  select(!junk) %>%
  filter(!grepl("s__", genus)) 

genus_data_long <- genus_data %>%
  pivot_longer(cols = !genus, names_to = "study", values_to = "read_number") 

##### Get species  #####
species_data <- data %>% 
  filter(grepl("s__", `#Classification`)) %>%
  separate(`#Classification`, c("junk", "species"), sep = spp) %>%
  select(!junk)

species_data_long <- species_data %>% 
  pivot_longer(cols = !species, names_to = "study", values_to = "read_number") 

##### How many reads per study? #####
total_reads_per_study <- species_data_long %>% 
  group_by(study) %>%
  summarise(total_reads = sum(read_number)) 

classified_reads_plot <- total_reads_per_study %>% 
  ggplot() + 
  aes(x = study, y = total_reads) + 
  geom_col() + 
  theme_classic() + 
  coord_flip()

##### Species classifications passing a specific read threshold (unfinished) #####
species_data_filtered <- species_data_long %>%
  filter(read_number > 1000)

genus_data_filtered <- genus_data_long %>%
  filter(read_number > 1000)

#### Export Dataframes to TSV files ####
write.table(kingdom_data_long, file = snakemake@output[["kingdom_table"]], sep = "\t", row.names = FALSE)
write.table(genus_data_long, file = snakemake@output[["genus_table"]], sep = "\t", row.names = FALSE)
write.table(species_data_long, file = snakemake@output[["spp_table"]], sep = "\t", row.names = FALSE)
write.table(total_reads_per_study, file = snakemake@output[["classified_reads"]], sep = "\t", row.names = FALSE)

#### Export Classified Reads Plot ####
pdf(snakemake@output[["classified_reads_plot"]])
print(classified_reads_plot)
dev.off()
library(tidyverse)

metaphlan_tbl <- read_tsv(snakemake@input[["concat_tbl"]], skip = 1) %>%
  select(!NCBI_tax_id)

#### Get Kingdom ####
metaphlan_tbl_king <- metaphlan_tbl %>%
  filter(!grepl("p__|g__|s__|c__|o__|f__", clade_name)) %>%
  mutate_at("clade_name", str_replace, "k__", "")

metaphlan_tbl_king_long <- metaphlan_tbl_king %>%
  pivot_longer(cols = !clade_name, 
               names_to = "samples", 
               values_to = "rel_abund")

#### Get Genus ####
metaphlan_tbl_genus <- metaphlan_tbl %>%
  filter(grepl("g__", clade_name)) %>%
  separate(clade_name, c("junk", "genus"), sep = "g__") %>%
  select(!junk) %>%
  filter(!grepl("s__", genus))

metaphlan_tbl_genus_long <- metaphlan_tbl_genus %>% 
  pivot_longer(cols = !genus, 
               names_to = "samples", 
               values_to = "rel_abund")

#### Get Species ####
metaphlan_tbl_spp <- metaphlan_tbl %>%
  filter(grepl("s__", clade_name)) %>%
  separate(clade_name, c("junk", "species"), sep = "s__") %>%
  select(!junk)

metaphlan_tbl_spp_long <- metaphlan_tbl_spp %>% 
  pivot_longer(cols = !species,
               names_to = "samples", 
               values_to = "rel_abund")

#### Export Dataframes to TSV files ####
write.table(metaphlan_tbl_king_long, file = snakemake@output[["kingdom_table"]], sep = "\t", row.names = FALSE)
write.table(metaphlan_tbl_genus_long, file = snakemake@output[["genus_table"]], sep = "\t", row.names = FALSE)
write.table(metaphlan_tbl_spp_long, file = snakemake@output[["spp_table"]], sep = "\t", row.names = FALSE)
library(tidyverse)
# Get colour palette
cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
# Set the read filter threshold (coerced from the Snakemake param)
filter <- as.numeric(as.character(snakemake@params[["bracken_threshold"]]))

#### Load Data #####
conc_data <- read_tsv(snakemake@input[["conc_input"]], col_names = c("species",
                                                                       "taxa_id", 
                                                                       "taxa_lvl", 
                                                                       "kraken_assigned_reads", 
                                                                       "added_reads", 
                                                                       "new_est_reads", 
                                                                       "fraction_total", 
                                                                       "file_path")) %>%
  filter(species != "name")

conc_data_clean <- conc_data %>%
  separate(file_path, c("file", "sample"), sep = "out/") %>%
  separate(sample, c("sample", "junk"), sep = "_bracken") %>%
  select(sample, species, taxa_id, taxa_lvl, kraken_assigned_reads, added_reads, new_est_reads, fraction_total) %>%
  mutate_at(c("taxa_id", "kraken_assigned_reads", "added_reads", "new_est_reads", "fraction_total"), as.numeric)


#### Added Reads Across Samples ####

sample_summary <- conc_data_clean %>%
  group_by(sample) %>%
  summarise(kraken_reads = sum(kraken_assigned_reads), 
            added_reads = sum(added_reads), 
            total_new_reads = sum(new_est_reads))

sample_summary_long <- sample_summary %>%
  pivot_longer(cols = !sample, 
               names_to = "classification_type", 
               values_to = "num_reads") 

sample_summary_plot <- sample_summary %>%
  pivot_longer(cols = !sample, 
               names_to = "classification_type", 
               values_to = "num_reads") %>%
  filter(classification_type != "total_new_reads") %>%
  ggplot() + 
  aes(x = reorder(sample, desc(num_reads)), y = num_reads, fill = classification_type) + 
  geom_bar(stat = "identity", position = "stack") + 
  coord_flip() + 
  scale_fill_manual(values = cbPalette) + 
  theme_classic(base_size = 25) + 
  xlab("Sample") + 
  ylab("Number of Reads")

ggsave(snakemake@output[["added_reads_plot"]], sample_summary_plot, height = 15, width = 20)

#### Stratified Heatmap of Relative Abundance Per Sample ####

conc_data_clean$species <- sub(" ", "_", conc_data_clean$species)

conc_data_clean_sep <- conc_data_clean %>%
  separate(species, into = c("Genus", "Species"), sep = "_") %>%
  filter(kraken_assigned_reads > filter)

conc_data_clean_genus_group <- conc_data_clean_sep %>%
  group_by(Genus) %>%
  summarise(n())

plot_stratified_heatmap <- function(){
  for(i in conc_data_clean_genus_group$Genus) {
  print(
    conc_data_clean_sep %>%
      filter(Genus == i) %>% 
      ggplot() + 
      aes(x = sample, y = Species) + 
      geom_tile(aes(fill = new_est_reads), linejoin = "round") + 
      scale_fill_gradientn(colours = c("white", "#F0E442", "#E69F00")) +
      facet_grid(rows = "Genus") + 
      theme_classic(base_size = 25) + 
      theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) + 
      xlab("Sample"))
  }
}

pdf(snakemake@output[["bracken_stratified_heatmap"]], height = 20, width = 20)
plot_stratified_heatmap() 
dev.off()

##### Heatmap of Species #####
plot_heatmap_per_sample <- function(data, filter_thresh){
  data %>%
    filter(new_est_reads > filter_thresh) %>%
    select(sample, species, fraction_total) %>%
    ggplot() +
    aes(x = sample, y = species) +
    geom_tile(aes(fill = log(fraction_total))) + 
    scale_fill_gradientn(colours = c("white", "orange", "blue")) + 
    theme_classic() + 
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 0.5)) + 
    ggtitle(paste0("Log Relative Abundance of Microbial Species (filtered: reads > ", filter_thresh, ")"))
}

pdf(snakemake@output[["bracken_overall_heatmap"]], height = 20, width = 20)
plot_heatmap_per_sample(conc_data_clean, filter)
dev.off()
library(tidyverse)
library(MetBrewer)

# Define column names for the concatenated CheckM table
columns <- c("bin", "marker_lineage", "num_genomes", "num_markers", "num_marker_sets", "completeness", "contamination",
             "strain_heterogen", "genome_size_bp", "num_ambiguous_bases", "num_scaffolds", "num_contigs", "N50_scaffold", "N50_contig",
             "mean_scaffold_length", "mean_contig_length", "longest_scaffold", "longest_contig", 
             "GC", "GC_std", "coding_density", "translation_table", "num_predicted_genes", "0", "1", "2", "3", "4", "5+", "sample")

# Read data, skip the first row and remove the redundant header rows left over from concatenation
bin_stats <- read_tsv(snakemake@input[["checkm_conc_result"]], col_names = columns, skip = 1) %>%
  filter(bin != "Bin Id")

# Convert columns to numeric
bin_stats$completeness <- as.numeric(bin_stats$completeness)
bin_stats$contamination <- as.numeric(bin_stats$contamination)
bin_stats$genome_size_bp <- as.numeric(bin_stats$genome_size_bp)
bin_stats$num_contigs <- as.numeric(bin_stats$num_contigs)
bin_stats$mean_contig_length <- as.numeric(bin_stats$mean_contig_length)
bin_stats$N50_contig <- as.numeric(bin_stats$N50_contig)

# Get breaks for plotting 
bin_stats$col <- cut(bin_stats$completeness, 
                        breaks = c(-Inf, 80, Inf), 
                        labels = c("<80", ">=80"))

bin_stats$contig_cut <- cut(bin_stats$num_contigs, 
                     breaks = c(-Inf, 250, Inf), 
                     labels = c("<=250", ">250"))

# Plot Completeness vs Contamination across all bins
CompletenessVsContam <- bin_stats %>%
  ggplot() + 
  aes(x = completeness, y = contamination, size = genome_size_bp, col = col) + 
  geom_point(alpha = 0.5) + 
  scale_x_continuous(limits = c(0, 100)) +
  theme_minimal() + 
  scale_color_manual(values = c("blue", "orange"))

pdf(snakemake@output[["CompletenessVsContam"]])
print(CompletenessVsContam)
dev.off()

#### Plot Completeness Vs Contamination Per Sample
bin_stats_sample_split <- bin_stats %>%
  separate(bin, into = c("sample", "bin_num"), sep = "\\.")

compvscontam_samp <- bin_stats_sample_split %>%
  ggplot() + 
  aes(x = completeness, y = contamination, col = sample) + 
  geom_point(alpha = 0.8, size = 4) + 
  scale_x_continuous(limits = c(0, 100 )) +
  geom_vline( xintercept = 80, linetype = "dotted", 
              color = "green", size = 1) + 
  geom_vline(xintercept = 50, linetype = "dotted", 
             color = "orange", size = 1) + 
  theme_bw(base_size = 18) + 
  scale_color_manual(values = met.brewer("Redon", 10, type = "discrete")) + 
  xlab("Completeness") + 
  ylab("Contamination") + 
  labs(colour = "Sample") + 
  theme(legend.position = "bottom", 
        legend.direction = "horizontal", 
        legend.text = element_text(size = 10))

pdf(snakemake@output[["CompletenessVsContam_Per_Sample"]])
print(compvscontam_samp)
dev.off()

#### Plot Bar Chart with Completeness vs Contamination Per Sample
bin_stats_bar <- bin_stats_sample_split %>%
  unite("sample_bin", sample:bin_num) %>%
  select(completeness, contamination, sample_bin) %>%
  pivot_longer(c(completeness, contamination), 
              names_to = "bin_rank", 
              values_to = "value") %>%
  ggplot() + 
  aes(x = reorder(sample_bin, desc(value)), y = value, fill = bin_rank) + 
  geom_bar(stat = "identity", position = "dodge") + 
  theme_bw(base_size = 15) + 
  coord_flip() + 
  scale_fill_manual(values = met.brewer("Redon", 2, type = "discrete"), labels = c("Completeness", "Contamination")) + 
  geom_hline(yintercept = 80, linetype = "dotted", 
              color = "green", size = 1) + 
  geom_hline(yintercept = 50, linetype = "dotted", 
             color = "orange", size = 1) + 
  xlab("Sample") + 
  ylab("Contamination and Completion Score (%)") + 
  labs(fill = "CheckM Completion vs Contamination Score") + 
  theme(legend.position = "bottom", 
        legend.direction = "vertical")

pdf(snakemake@output[["BarChartCompletenessContamination"]])
print(bin_stats_bar)
dev.off()

#### Plot the number of contigs vs completeness
NumContigsVsCompleteness <- bin_stats %>%
  ggplot() + 
  aes(x = num_contigs, y = completeness, size = mean_contig_length, col = contig_cut) + 
  geom_point(alpha = 0.5) + 
  theme_minimal() + 
  scale_color_manual(values = c("blue", "orange"))

pdf(snakemake@output[["NumContigsVsCompleteness"]])
print(NumContigsVsCompleteness)
dev.off()
library(tidyverse)
library(MetBrewer)

#### Import Kraken Data
concatenated_kraken_summaries <- read_tsv(snakemake@input[["concatenated_kraken_summary"]], 
                                            col_names = c("proportion_of_reads", 
                                                        "number_of_reads", 
                                                        "number_of_reads2",
                                                        "read_code", 
                                                        "read_code2", 
                                                        "classification_status", 
                                                        "file_path"))

concatenated_kraken_summaries_clean <- concatenated_kraken_summaries %>%
  separate(file_path, into = c("junk", "filename"), sep = "classified_summary/") %>%
  separate(filename, into = c("sample", "junk2"), sep = "_classified_summary.txt") %>%
  select(sample, proportion_of_reads, number_of_reads, classification_status) 


#### Plot Proportion of Classified vs Unclassified Reads from Kraken
plot_kraken_proportion <- function(kraken_table){

  concatenated_kraken_summaries_clean_plot <- kraken_table %>%
    ggplot() + 
    aes(x = sample, y = number_of_reads, fill = classification_status) + 
    geom_bar(stat = "identity", position = "fill") + 
    theme_bw(base_size = 15) + 
    scale_fill_manual(values = met.brewer("Redon", n = 2, type = "continuous"), 
                      labels = c("Classified Reads", "Unclassified Reads")) + 
    xlab("Sample") + 
    ylab("Proportion of Reads") + 
    theme(legend.title = element_blank(), 
          axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1), 
          legend.position = "top")

  return(concatenated_kraken_summaries_clean_plot)

}

kraken_classification_proportions <- plot_kraken_proportion(concatenated_kraken_summaries_clean)

#### Save file
ggsave(snakemake@output[["classified_proportions"]], kraken_classification_proportions)
library(tidyverse)
library(MetBrewer)

##### Import Data #####

spp_tbl <- read_tsv(snakemake@input[["spp_table"]])
genus_tbl <- read_tsv(snakemake@input[["genus_table"]]) 

##### Define User Read Threshold #####
filter_threshold <- as.character(snakemake@params[["strat_thresh"]])
filter_threshold <- as.numeric(filter_threshold)

genus_thres <- as.character(snakemake@params[["genus_thresh"]])
genus_thres <- as.numeric(genus_thres)

spp_thres <- as.character(snakemake@params[["species_thresh"]])
spp_thres <- as.numeric(spp_thres)

##### Clean and Prep Data #####
genus_tbl_clean <- genus_tbl %>%
  separate(study, into = c("sample", "junk"), sep = "_") %>%
  select(!junk) %>%
  rename("phylo_level" = genus)

spp_tbl_clean <- spp_tbl %>%
  separate(study, into = c("sample", "junk"), sep = "_") %>%
  select(!junk) %>%
  rename("phylo_level" = species)

##### Plot Function #####

kraken_heatmap <- function(clean_kraken_table, read_thresh, type){
  heatmap <- clean_kraken_table %>%
    filter(read_number > read_thresh) %>%
    filter(phylo_level != "Homo_sapiens") %>%
    ggplot() + 
    aes(x = sample, y = phylo_level) + 
    geom_tile(aes(fill = log(read_number)), linejoin = "round") + 
    scale_fill_gradientn(colours = met.brewer("Hiroshige", 4, type = "discrete")) +
    theme_bw(base_size = 20) +
    theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1), 
          legend.position = "top", 
          legend.text = element_text(size = 15), 
          legend.title = element_text(size = 15)) + 
    labs(fill = "Read Number (log)") + 
    xlab("Sample") + 
    ylab(type)

  return(heatmap)

}

species_heatmap <- kraken_heatmap(spp_tbl_clean, spp_thres, "Species")
genus_heatmap <- kraken_heatmap(genus_tbl_clean, genus_thres, "Genus")

ggsave(snakemake@output[["genus_heatmap"]], genus_heatmap, height = 20, width = 15)
ggsave(snakemake@output[["species_heatmap"]], species_heatmap, height = 20, width = 15)

##### Faceted Plot By Genus #####
spp_tbl_split <- spp_tbl_clean %>% 
  separate(phylo_level, into = c("Genus", "Species1", "Species2"), sep = "_") %>%
  unite("Species", Species1:Species2, na.rm = TRUE) 

genus_list <- spp_tbl_split %>%
  group_by(Genus) %>%
  summarise(n()) 

plot_species_facets <- function() {
  for(i in genus_list$Genus) {
    print(
      spp_tbl_split %>%
        filter(Genus == i) %>% 
        ggplot() + 
        aes(x = sample, y = Species) + 
        geom_tile(aes(fill = read_number), linejoin = "round") + 
        scale_fill_gradientn(colours = c("white", "#F0E442", "#E69F00")) +
        facet_grid(rows = "Genus") + 
        theme_classic(base_size = 25) + 
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
        )
    }
}

pdf(snakemake@output[["stratified_heatmap"]], height = 20, width = 20)
plot_species_facets()
dev.off()
library(tidyverse)
library(MetBrewer)
library(treemapify)

##### Import Data #####

spp_tbl <- read_tsv(snakemake@input[["spp_table"]])
genus_tbl <- read_tsv(snakemake@input[["genus_table"]]) 

##### Define User Read Threshold #####
filter_threshold <- as.character(snakemake@params[["strat_thresh"]])
filter_threshold <- as.numeric(filter_threshold)

genus_thres <- as.character(snakemake@params[["genus_thresh"]])
genus_thres <- as.numeric(genus_thres)

spp_thres <- as.character(snakemake@params[["species_thresh"]])
spp_thres <- as.numeric(spp_thres)

##### Clean and Prep Data #####
genus_tbl_clean <- genus_tbl %>%
  separate(study, into = c("sample", "junk"), sep = "_") %>%
  select(!junk) %>%
  rename("phylo_level" = genus)

spp_tbl_clean <- spp_tbl %>%
  separate(study, into = c("sample", "junk"), sep = "_") %>%
  select(!junk) %>%
  rename("phylo_level" = species)

##### Plot Spatial Plots #####
plot_spatial_classification <- function(classification_tbl, read_thresh){
  tbl_clean_sum <- classification_tbl %>%
    group_by(phylo_level) %>%
    summarise(summed_read_num = sum(read_number)) %>%
    filter(summed_read_num > read_thresh)

  spatial_plot <- tbl_clean_sum %>%
    filter(phylo_level != "Homo_sapiens") %>%
    filter(summed_read_num > read_thresh) %>%
    ggplot(aes(area = summed_read_num, fill = summed_read_num, label = phylo_level, subgroup = phylo_level)) +
    geom_treemap() + 
    geom_treemap_subgroup_border(colour = "white", size = 1) + 
    geom_treemap_text(colour = "white", place = "center", reflow = T) + 
    scale_fill_gradientn(colours = met.brewer("Hiroshige", 10, type = "discrete"), name = "Total Read Number") +
    theme_minimal(base_size = 15) + 
    theme(legend.position = "right")

  return(spatial_plot)
}

species_spatial_plot <- plot_spatial_classification(spp_tbl_clean, spp_thres)
genus_spatial_plot <- plot_spatial_classification(genus_tbl_clean, genus_thres)

ggsave(snakemake@output[["genus_spatial_plot"]], genus_spatial_plot, height = 20, width = 15)
ggsave(snakemake@output[["species_spatial_plot"]], species_spatial_plot, height = 20, width = 15)
tool / cran

tidyverse

Easily Install and Load the 'Tidyverse': The 'tidyverse' is a set of packages that work in harmony because they share common data representations and 'API' design. This package is designed to make it easy to install and load multiple 'tidyverse' packages in a single step. Learn more about the 'tidyverse' at https://www.tidyverse.org.
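As a minimal sketch (not taken from any of the workflows above), installing and attaching the meta-package is a single step, after which the packages used throughout these snippets (readr, dplyr, tidyr, ggplot2 and friends) are available:

install.packages("tidyverse")  # install once from CRAN
library(tidyverse)             # attaches ggplot2, dplyr, tidyr, readr, purrr, tibble, stringr, forcats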