Code Snippets

Snakemake workflow: rna-seq-kallisto-sleuth

# Send regular output and messages/warnings to the Snakemake log file.
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")

library("sleuth")
library("ggplot2")
library("tidyr")

# Load the table stored under the "diffexp_rds" input and drop every row whose
# p-value is missing, so the histogram only bins real values.
diffexp <- sleuth_load(snakemake@input[["diffexp_rds"]]) %>% drop_na(pval)

# Keep the plot as an object instead of leaving a bare ggplot expression at top
# level: a bare expression is auto-printed (rendering to the default graphics
# device) and ggsave() then has to rely on last_plot().
pval_hist <- ggplot(diffexp) + geom_histogram(aes(pval), bins = 100)

# Spell out `filename` (the original `file =` only worked through partial
# argument matching) and pass the plot explicitly.
ggsave(filename = snakemake@output[[1]], plot = pval_hist, width = 14)

R ggplot2 tidyr sleuth From line 1 of scripts/plot-diffexp-pval-hist.R

Metagenomic pipeline managed by snakemake (1.0)

# Positional command-line arguments (script name and R options excluded).
# As used further down: args[1] is the output file, args[2] and args[3] are
# the two input tables, and args[4] is the thread count for fread()/fwrite().
args <- commandArgs(trailingOnly = TRUE)

library(dplyr)
library(tidyr)
library(data.table)
# Expand collapsed keys so that every key sits on its own row.
#
# The columns of `df` are renamed to "key" and "value". Each "key" entry is
# split on "; " and the table is unnested, so a row whose key held several
# entries becomes several rows sharing the same value. The result is returned
# with its two columns in swapped position (second column first).
fixCollapsed <- function(df){
    names(df) <- c("key", "value")
    split_keys <- mutate(df, key = strsplit(key, "; "))
    expanded <- unnest(split_keys, key)
    expanded[, c(2, 1)]
}
# Merge rows that share a key into a single row.
#
# All values belonging to one key are concatenated with "; ", the combined
# string is split on "; " again, duplicates are removed (first occurrence
# kept) and the remainder is re-joined with "; ".
#
# df: table with columns `key` and `value`.
# Returns one row per distinct key with the de-duplicated `value` string.
fixDuplicated <- function(df){
    df <- df %>%
        group_by(key) %>%
        summarise(value = paste(value, collapse = "; "))
    values <- strsplit(df$value, "; ")
    values <- lapply(values, unique)
    # vapply() always returns a character vector; sapply() would return an
    # empty list for a zero-row table and corrupt the column type.
    df$value <- vapply(values, paste, character(1), collapse = "; ")
    df
}
# Drop every row whose key starts with "-".
#
# df: table with a `key` column.
# Returns `df` restricted to the rows whose key does not match "^-".
removeUnknown <- function(df){
    is_unknown <- grepl("^-", df$key)
    df[!is_unknown,]
}
# Thread count shared by every fread()/fwrite() call; parsed once.
nThreads <- as.integer(args[4])

# Read one headerless table, normalise it (split collapsed keys, merge
# duplicated keys, drop keys starting with "-") and write it tab-separated to
# the output file args[1].
#
# input:  path of the table to read (expected to have two columns, since
#         fixCollapsed() assigns exactly two column names).
# append: FALSE to start the output file, TRUE to add to an existing one.
writeDictionary <- function(input, append){
    fread(input, stringsAsFactors = FALSE,
        header = FALSE, nThread = nThreads) %>%
        as.data.frame() %>%
        fixCollapsed() %>%
        fixDuplicated() %>%
        removeUnknown() %>%
        fwrite(file = args[1], sep = "\t",
            append = append, nThread = nThreads)
}

# The first table creates the output, the second one is appended to it.
writeDictionary(args[2], append = FALSE)
writeDictionary(args[3], append = TRUE)

R dplyr data.table tidyr From line 1 of main/createDictionary.R

Integrated Mapping and Profiling of Allelically-expressed Loci with Annotations (1.0.0)

library(optparse)

## ---------------------------------------------------------------------------
## OPT PARSE
## ---------------------------------------------------------------------------

# options
# Every option is a character-valued flag with the same metavar, so they are
# all built through one small constructor.
char_option <- function(short_flag, long_flag, help_text) {
  make_option(c(short_flag, long_flag), type = "character",
              help = help_text, metavar = "character")
}

option_list <- list(
  char_option("-c", "--chromSize", "chromosome sizes"),
  char_option("-p", "--centPos", "centred centromere positions"),
  char_option("-n", "--cna", "copy number segment file (preferably condensed)"),
  char_option("-d", "--dmr", "DMR positions"),
  char_option("-a", "--ase", "ASE summary table file"),
  char_option("-g", "--genes", "Gene annotation file"),
  char_option("-o", "--out", "output file prefix")
)

# Parse the command line into the `opt` list used by the rest of the script.
opt_parser <- OptionParser(option_list = option_list)
opt <- parse_args(opt_parser)

#Note: these packages need to be installed.
suppressMessages(library(dplyr))
suppressMessages(library(reshape2))
suppressMessages(library(ggplot2))
suppressMessages(library(tidyr))
suppressMessages(library(cowplot))

# chromosome size/position info
#chromSize <- read.delim("/projects/hpv_nanopore_prj/refs/hg38_no_alt_TCGA_HTMCP_HPVs_chromSizes.txt", header = F)
#centPos <-  read.delim("/projects/hpv_nanopore_prj/refs/hg38_centromere_positions_merged.bed", header = F)
#genes <- read.delim("/projects/hpv_nanopore_prj/htmcp/ase/pull_trial/vporter-allelespecificexpression/output/HTMCP.03.06.02058/3_cancer/raw/gene_annotation.bed", header = F)
# Chromosomes drawn in the figure, in plotting order.
main_chroms <- c(paste0("chr", 1:22), "chrX")

# All three inputs are headerless tab-delimited tables (columns V1, V2, ...).
chromSize <- read.delim(opt$chromSize, header = FALSE)
centPos <- read.delim(opt$centPos, header = FALSE)
genes <- read.delim(opt$genes, header = FALSE)

## ---------------------------------------------------------------------------
## POSITION CHROMOSOME INFO
## ---------------------------------------------------------------------------

# Keep only the main chromosomes, name the columns, and order the chromosome
# factor for plotting.
chromSize <- chromSize[chromSize$V1 %in% main_chroms, ]
names(chromSize) <- c("chr", "size")
chromSize$chr <- factor(chromSize$chr, levels = main_chroms)

# Express sizes in Mb to keep the axis readable.
chromSize$size <- chromSize$size / 1e6

# Centromeres: midpoint of each interval, also in Mb. Chromosomes outside
# `main_chroms` become NA in the factor.
names(centPos) <- c("chr", "start", "end")
centPos$chr <- factor(centPos$chr, levels = main_chroms)
centPos$centre <- (centPos$start + ((centPos$end - centPos$start) / 2)) / 1e6

## ---------------------------------------------------------------------------
## COPY NUMBER
## ---------------------------------------------------------------------------
# Only run when a copy number file was supplied. `&&` short-circuits on a
# missing (NULL) option; the element-wise `&` would produce a zero-length
# condition and make `if` stop the script.
if (!is.null(opt$cna) && opt$cna != ""){
  cna <- read.delim(opt$cna, header = TRUE)

  # prefix the chromosome names with "chr"
  cna$chr <- paste0("chr", cna$chr)

  # rearrange to make a bed file
  cna_bed <- cna[,c("chr", "pos", "end", "CN", "zygosity", "A", "B")]

  # categorize copy number: homozygous segments are LOH, segments with more
  # copies of A than B are imbalanced, everything else is balanced
  cna_bed <- cna_bed %>%
    mutate(CN.Status = 
             case_when(
               zygosity == "HOM" ~ "LOH",
               A > B ~ "imbalance",
               TRUE ~ "balance"
             ))

  # Divide by 1Mb for axis
  cna_bed$end <- cna_bed$end/1000000
  cna_bed$pos <- cna_bed$pos/1000000

  # Change to factor and reorder levels
  cna_bed$chr <- factor(cna_bed$chr,levels=c(paste0("chr", 1:22), "chrX"))

  # the two categories that are drawn on the karyogram
  cnaLOH <- cna_bed %>% filter(CN.Status == "LOH")
  cnaGAIN <- cna_bed %>% filter(CN.Status == "imbalance") 
}

## ---------------------------------------------------------------------------
## DIFFERENTIAL METHYLATION
## ---------------------------------------------------------------------------

# Only run when a DMR file was supplied. `&&` short-circuits on a missing
# (NULL) option; the element-wise `&` would produce a zero-length condition
# and make `if` stop the script.
if (!is.null(opt$dmr) && opt$dmr != ""){
  dmr <- read.delim(opt$dmr, header = TRUE)

  # Divide by 1Mb for axis
  dmr$start <- dmr$start/1000000
  dmr$end <- dmr$end/1000000
  dmr$middle <- (dmr$start + dmr$end) / 2

  # Change to factor and reorder levels
  dmr$chr <- factor(dmr$chr, levels=c(paste0("chr", 1:22), "chrX"))

  # count in 1Mb bins: the bin is the integer part of the midpoint in Mb,
  # keyed as "<chr>:<bin>"
  # NOTE(review): chromosomes outside chr1-22/chrX are NA after the factor
  # conversion and are all counted under "NA:<bin>", so they can influence
  # maxDMR below - confirm this is intended.
  dmrCount <- data.frame(table(as.factor(paste0(dmr$chr, ":", as.integer(dmr$middle)))))

  # split the chromosome name and bin position
  dmrPlot <- separate(dmrCount, col = Var1, into = c("chr", "pos"), sep = ":", remove = TRUE)

  # scale to fit the plot - i.e. make the maximum width 0.65
  maxDMR <- max(dmrPlot$Freq)
  dmrPlot$percMax <- dmrPlot$Freq/maxDMR
  dmrPlot$percMax <- dmrPlot$percMax * 0.65

  # Change to factor and reorder levels
  dmrPlot$chr <- factor(dmrPlot$chr, levels=c(paste0("chr", 1:22), "chrX"))
  dmrPlot$pos <- as.numeric(dmrPlot$pos)
}

## ---------------------------------------------------------------------------
## ASE GENE HISTOGRAM
## ---------------------------------------------------------------------------

#ase <- read.delim("/projects/hpv_nanopore_prj/htmcp/ase/pull_trial/vporter-allelespecificexpression/output/HTMCP.03.06.02058/summaryTable.tsv", header = T)
# Load the ASE summary table and keep only the genes called as ASE.
ase <- read.delim(opt$ase, header = TRUE)
ase <- ase[ase$aseResults == "ASE", ]

# Look each gene up once in the annotation (V4 = gene name) and copy its
# chromosome (V1), start (V2) and end (V3).
gene_idx <- match(ase$gene, genes$V4)
ase$chr <- genes$V1[gene_idx]
ase$start <- genes$V2[gene_idx]
ase$end <- genes$V3[gene_idx]

# Gene midpoint, used as the plotting position.
ase$middle <- (ase$start + ase$end) / 2

# Keep only the plotting columns and drop genes without a complete position.
ase <- ase[, c("chr", "start", "end", "middle")]
ase <- ase[complete.cases(ase), ]

# Express the midpoint in Mb.
ase$middle <- ase$middle / 1000000

# Count genes per 1Mb bin, keyed as "<chr>:<bin>".
ase_bin_key <- paste0(ase$chr, ":", as.integer(ase$middle))
aseCount <- data.frame(table(as.factor(ase_bin_key)))

# Split the key back into chromosome and bin position, skipping the all-NA key.
asePlot <- separate(aseCount[aseCount$Var1 != "NA:NA", ],
                    col = Var1, into = c("chr", "pos"), sep = ":",
                    remove = TRUE)

# Scale the counts so that the fullest bin is 0.65 wide on the plot.
maxASE <- max(asePlot$Freq)
asePlot$percMax <- asePlot$Freq / maxASE * 0.65

# Order the chromosome factor for plotting and make the bin position numeric.
asePlot$chr <- factor(asePlot$chr, levels = c(paste0("chr", 1:22), "chrX"))
asePlot$pos <- as.numeric(asePlot$pos)

## ---------------------------------------------------------------------------
## PLOT OPTIONS
## ---------------------------------------------------------------------------

##### CNV AND DMRs AVAILABLE

# Which optional tracks were supplied. An option that was never passed (NULL)
# or was passed as "" counts as absent. `&&` short-circuits on NULL; the
# element-wise `&` would hand `if` a zero-length condition and stop the script.
has_cna <- !is.null(opt$cna) && opt$cna != ""
has_dmr <- !is.null(opt$dmr) && opt$dmr != ""

# Fill colours keyed by legend category, so a single named scale serves every
# combination of tracks.
fill_colours <- c(ase = "#005f73", dmr = "#ae2012",
                  gain = "#ee9b00", loh = "#94d2bd")

# Legend, drawn by hand in data coordinates on the first panel.
# adL: rectangles (a scale bar plus two end ticks per density track, and one
#      swatch per CNV class); adW: the text labels, including the bin count
#      that corresponds to a full-width bar.
adL <- data.frame(xmin = c(9.7, 9.7, 10.3), xmax = c(10.35, 9.75, 10.35),
                  ymin = c(240, 238, 238), ymax = c(243, 243, 243),
                  fill = "ase", stringsAsFactors = FALSE)
adW <- data.frame(x = c(11.5, 10), y = c(240, 250),
                  label = c("ASE Gene Density", as.character(maxASE)),
                  stringsAsFactors = FALSE)
if (has_dmr) {
  adL <- rbind(adL,
               data.frame(xmin = c(9.7, 9.7, 10.3), xmax = c(10.35, 9.75, 10.35),
                          ymin = c(210, 208, 208), ymax = c(213, 213, 213),
                          fill = "dmr", stringsAsFactors = FALSE))
  adW <- rbind(adW,
               data.frame(x = c(11.2, 10), y = c(210, 220),
                          label = c("DMR Density", as.character(maxDMR)),
                          stringsAsFactors = FALSE))
}
if (has_cna) {
  adL <- rbind(adL,
               data.frame(xmin = c(7.1, 7.1), xmax = c(7.26, 7.26),
                          ymin = c(235, 205), ymax = c(245, 215),
                          fill = c("gain", "loh"), stringsAsFactors = FALSE))
  adW <- rbind(adW,
               data.frame(x = c(8.25, 7.6), y = c(240, 210),
                          label = c("Imbalanced CNV", "LOH"),
                          stringsAsFactors = FALSE))
}

# Restrict a table to the given chromosomes and rebuild its `chr` factor with
# exactly those levels. The rectangles are positioned with as.integer(chr), so
# the factor codes must match the panel's x positions (1 = first chromosome of
# the panel).
subset_chroms <- function(df, chroms) {
  df <- df[df$chr %in% chroms, , drop = FALSE]
  df$chr <- factor(df$chr, levels = chroms)
  df
}

# Build one karyogram panel.
#
# chroms:            chromosome names shown in this panel, in x-axis order.
# centromere_colour: colour of the centromere points.
# y_label:           y-axis title (NULL for none).
# with_legend:       add the hand-drawn legend (adL/adW) and its fill scale.
#
# Layers are added in a fixed order: chromosome bars, LOH and imbalanced CNV
# (only with a copy number file), ASE gene density to the right of each bar,
# DMR density to the left (only with a DMR file), centromeres, legend.
karyogram_panel <- function(chroms, centromere_colour, y_label = NULL,
                            with_legend = FALSE) {
  panel <- ggplot() +
    # chromosome bars
    geom_segment(data = subset_chroms(chromSize, chroms),
                 aes(x = chr, xend = chr, y = 0, yend = size),
                 lineend = "round", color = "lightgrey", size = 5)

  if (has_cna) {
    panel <- panel +
      # LOH
      geom_rect(data = subset_chroms(cnaLOH, chroms),
                aes(xmin = as.integer(chr) - 0.08, xmax = as.integer(chr) + 0.08,
                    ymin = pos, ymax = end),
                fill = "#94d2bd", size = 0.2) +
      # Imbalanced CNV
      geom_rect(data = subset_chroms(cnaGAIN, chroms),
                aes(xmin = as.integer(chr) - 0.08, xmax = as.integer(chr) + 0.08,
                    ymin = pos, ymax = end),
                fill = "#ee9b00", size = 0.2)
  }

  panel <- panel +
    # ASE genes
    geom_rect(data = subset_chroms(asePlot, chroms),
              aes(xmin = as.integer(chr) + 0.1,
                  xmax = (as.integer(chr) + 0.1 + percMax),
                  ymin = pos, ymax = pos + 1),
              fill = "#005f73", size = 0.25)

  if (has_dmr) {
    panel <- panel +
      # DMRs
      geom_rect(data = subset_chroms(dmrPlot, chroms),
                aes(xmin = (as.integer(chr) - 0.1 - percMax),
                    xmax = as.integer(chr) - 0.1,
                    ymin = pos, ymax = pos + 1),
                fill = "#ae2012", size = 0.25)
  }

  panel <- panel +
    # centromeres
    geom_point(data = subset_chroms(centPos, chroms), aes(x = chr, y = centre),
               size = 5, colour = centromere_colour)

  if (with_legend) {
    panel <- panel +
      # legend bars
      geom_rect(data = adL,
                aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax, fill = fill),
                size = 0.25) +
      # legend text
      geom_text(data = adW,
                aes(x = x, y = y, label = label)) +
      scale_fill_manual(values = fill_colours)
  }

  panel +
    ylim(0, 250) +
    theme_classic() +
    theme(text = element_text(size = 15), axis.line = element_blank(),
          axis.ticks.x = element_blank(),
          legend.position = "none") +
    labs(x = NULL, y = y_label)
}

# chromosomes 1 - 12, carrying the legend and the y-axis title
# NOTE(review): the original drew the centromeres of this panel in "gray" when
# a copy number file is present and in "black" otherwise, while the second
# panel is always "black". That is kept as-is; confirm whether it is intended.
p1 <- karyogram_panel(paste0("chr", 1:12),
                      centromere_colour = if (has_cna) "gray" else "black",
                      y_label = "Chromosome Size (Mb)",
                      with_legend = TRUE)

# chromosomes 13 - 22 + X
p2 <- karyogram_panel(c(paste0("chr", 13:22), "chrX"),
                      centromere_colour = "black")


## ---------------------------------------------------------------------------
## PLOT 
## ---------------------------------------------------------------------------

# Stack the two panels vertically, aligned on their left axis.
karyogram <- plot_grid(p1, p2, nrow = 2, align = "v", axis = "l")

# Write the combined figure to "<out>.pdf" (10 x 7 inches).
ggsave(filename = paste0(opt$out,".pdf"), plot = karyogram,
       width = 10, height = 7, units = "in")

R ggplot2 dplyr tidyr cowplot optparse reshape2 From line 6 of src/karyogramFigure.R

suppressMessages(library(optparse))
suppressMessages(library(dplyr))
suppressMessages(library(reshape2))
suppressMessages(library(prob))
suppressMessages(library(tidyr))
suppressMessages(library(MBASED))
suppressMessages(library(SummarizedExperiment))
suppressMessages(library(stats))
suppressMessages(library(tibble))

## ---------------------------------------------------------------------------
## LOAD INPUT 
## ---------------------------------------------------------------------------

# Make help options
option_list <- list(
  make_option(c("-p", "--phase"), type = "character", default = NULL,
              help = "Phased VCF file (from WhatsHap)", metavar = "character"),
  make_option(c("-r", "--rna"), type = "character", default = NULL,
              help = "Tumour RNA vcf file (from Strelka2)", metavar = "character"),
  make_option(c("-o", "--outdir"), type = "character", default = "mBASED",
              help = "Output directory name", metavar = "character"),
  # The default used to be the string "mBASED" (copied from --outdir), which
  # is not a valid worker count; fall back to a single worker instead.
  make_option(c("-t", "--threads"), type = "integer", default = 1L,
              help = "Threads used for mbased", metavar = "integer")
)

# Parse the command line; `out` is the output directory and `threads` the
# number of workers handed to runMBASED() further down.
opt_parser <- OptionParser(option_list = option_list)
opt <- parse_args(opt_parser)
out <- opt$outdir
threads <- opt$threads

## ---------------------------------------------------------------------------
## USER FUNCTIONS
## ---------------------------------------------------------------------------

# extract info from a list
# Pull element `n` out of every entry of `list`.
#
# list: a list of vectors (e.g. the result of strsplit()).
# n:    index to extract from each entry.
# Returns the extracted elements, simplified by sapply(); an entry shorter
# than `n` contributes NA.
list_n_item <- function(list, n){
  sapply(list, function(entry) entry[n])
}

# define function to print out the summary of ASE results
# Summarise the result of a one-sample MBASED run.
#
# MBASEDOutput: object returned by runMBASED(); its gene-level assays and the
#               `locusSpecificResults` entry of its metadata are read. The
#               per-locus table is expected to have a column named `mySample`
#               (the sample name used when the input object is built).
#
# Returns a list with
#   geneOutput:  data frame with one row per gene holding
#                majorAlleleFrequency, pValueASE, pValueHeterogeneity (first
#                assay column each) and allele1IsMajor.
#   locusOutput: the per-locus ranges, annotated with allele1IsMajor and MAF
#                and split by aseID in order of first appearance.
summarizeASEResults_1s <- function(MBASEDOutput) {

  gene_assays <- assays(MBASEDOutput)
  locus_results <- metadata(MBASEDOutput)$locusSpecificResults
  locus_assays <- assays(locus_results)

  geneOutputDF <- data.frame(
    majorAlleleFrequency = gene_assays$majorAlleleFrequency[,1],
    pValueASE = gene_assays$pValueASE[,1],
    pValueHeterogeneity = gene_assays$pValueHeterogeneity[,1])

  # Locus row names have the form "<gene>:<label>"; keep the part before the
  # first ":" and take one allele1IsMajor value per gene. first() returns the
  # value the match() below would pick anyway, while keeping summarise() at
  # one row per group (several rows per group are deprecated since dplyr 1.1.0).
  geneAllele <- as.data.frame(locus_assays$allele1IsMajor) %>%
    rownames_to_column(var = "rowname") %>%
    dplyr::mutate(gene = vapply(strsplit(rowname, split = ":"), `[`, character(1), 1)) %>%
    dplyr::group_by(gene) %>%
    dplyr::summarise(allele1IsMajor = dplyr::first(mySample))

  geneOutputDF$allele1IsMajor <- geneAllele$allele1IsMajor[match(rownames(geneOutputDF), geneAllele$gene)]

  lociOutputGR <- rowRanges(locus_results)
  lociOutputGR$allele1IsMajor <- locus_assays$allele1IsMajor[,1]
  lociOutputGR$MAF <- locus_assays$MAF[,1]
  lociOutputList <- split(lociOutputGR, factor(lociOutputGR$aseID, levels=unique(lociOutputGR$aseID)))

  list(
    geneOutput=geneOutputDF,
    locusOutput=lociOutputList
  )
}

## ---------------------------------------------------------------------------
## READ IN THE RNA SNV CALLS
## ---------------------------------------------------------------------------

# read in the RNA calls
# Lines starting with "#" are skipped and the first remaining line is consumed
# as a header before the column names are overwritten below.
# NOTE(review): if the only header line of the file starts with "#", the first
# variant row would be swallowed as the header - confirm the file layout.
rna_filt <- read.delim(opt$rna, header = TRUE, comment.char = "#",
                       stringsAsFactors = FALSE)
names(rna_filt) <- c("CHROM", "POS", "AD","REF","ALT","gene", "gene_biotype")

# "CHROM:POS" key, used later to match variants against the phased calls.
rna_filt$variant <- paste0(rna_filt$CHROM, ":", rna_filt$POS)

## ---------------------------------------------------------------------------
## EXTRACT REF/ALT READ COUNTS
## ---------------------------------------------------------------------------

# AD is a comma-separated pair: first entry = REF count, second = ALT count.
allele_depths <- strsplit(rna_filt$AD, ",")
rna_filt$REF.COUNTS <- as.numeric(list_n_item(allele_depths, 1))
rna_filt$ALT.COUNTS <- as.numeric(list_n_item(allele_depths, 2))

## ---------------------------------------------------------------------------
## MBASED WITH OR WITHOUT PHASING
## ---------------------------------------------------------------------------

### WITH PHASING
# Run MBASED on the RNA variants, either with phasing information (when a
# phased VCF was passed via --phase) or without it. Both branches save the raw
# MBASED object to the output directory and leave the summarised per-gene /
# per-locus tables in `results`, which is saved at the very end.
if (!is.null(opt$phase)){

  ### 
  ### PHASING
  ###

  # WhatsHap phased VCF from ONT sequencing pipeline
  # (read as a headerless table; "#" lines are skipped, and the ten standard
  # single-sample VCF column names are assigned below)
  wh <- read.delim(opt$phase, header = F, comment.char = "#", stringsAsFactors = F)
  colnames(wh) <- c("CHROM",  "POS",  "ID",  "REF", "ALT", "QUAL", "FILTER",  "INFO", "FORMAT", "SAMPLE")
  # same "CHROM:POS" key as in rna_filt, used for matching
  wh$variant <- paste0(wh$CHROM, ":", wh$POS)

  # remove unphased variants - phased variants have the pipe "|" symbol in column 10 - and remove indels
  # (indels = REF or ALT longer than one character)
  wh <- wh[grep("|", wh$SAMPLE, fixed=TRUE),]
  wh <- wh %>% dplyr::filter(nchar(REF) == 1 & nchar(ALT) == 1)

  # add genotype from the SAMPLE column as a new column
  # (the genotype is the first ":"-separated field of SAMPLE)
  info2 <- strsplit(wh$SAMPLE, ":")
  wh$GT <- list_n_item(info2, 1)

  # Add the genotype from WhatsHap 
  # (RNA variants without a phased counterpart get NA)
  rna_filt$GT <- wh$GT[match(rna_filt$variant, wh$variant)]

  # Find unphased genes with one variant (test)
  # i.e. variants missing from the phased set whose gene has exactly one row
  # in rna_filt; left_join() has no `by`, so it joins on the shared column
  # `gene`
  singleUnphased <- rna_filt %>%
    mutate(phase = variant %in% wh$variant) %>%
    left_join(rna_filt %>% group_by(gene) %>% summarize(n=n())) %>%
    dplyr::filter(!phase & n == 1)

  # Add genotype to unphased gene with one variant (test)
  # (these are assigned "1|0", so their ALT allele becomes allele A below)
  rna_filt$GT[which(rna_filt$variant %in% singleUnphased$variant)] <- "1|0"

  # annotate the phased variants as alleleA and alleleB
  # GT "1|0": ALT is allele A; any other non-missing GT: REF is allele A;
  # a missing GT yields NA
  rna_filt$alleleA <- ifelse(rna_filt$GT == "1|0", rna_filt$ALT, rna_filt$REF)
  rna_filt$alleleB <- ifelse(rna_filt$GT == "1|0", rna_filt$REF, rna_filt$ALT)

  # add the phased COUNTS variants as alleleA and alleleB
  rna_filt$alleleA.counts <- ifelse(rna_filt$GT == "1|0", rna_filt$ALT.COUNTS, rna_filt$REF.COUNTS)
  rna_filt$alleleB.counts <- ifelse(rna_filt$GT == "1|0", rna_filt$REF.COUNTS, rna_filt$ALT.COUNTS)

  # phased only variants
  # (complete.cases() drops every row with an NA in ANY column, which removes
  # the variants left without a genotype above)
  rna_phased <- rna_filt[complete.cases(rna_filt),]

  # make SNV IDs
  # ("<gene>:SNV<k>", numbered within each gene in CHROM/POS order)
  rna_phased <- rna_phased %>%
    arrange(CHROM, POS) %>%
    group_by(gene) %>%
    mutate(label = paste0("SNV",1:n()))
  rna_phased$SNV.ID <- paste0(rna_phased$gene, ":", rna_phased$label)

  ### 
  ### MBASED
  ###

  print("Beginning MBASED ...")

  # make the GRanges object of the loci
  # NOTE(review): allele1/allele2 are annotated as REF/ALT here, whereas the
  # count matrices below use alleleA/alleleB (which swap REF and ALT for
  # "1|0" genotypes) - confirm the annotation is intended.
  mySNVs <- GRanges(seqnames=rna_phased$CHROM,
                     ranges=IRanges(start=rna_phased$POS, width=1),
                     aseID=rna_phased$gene,
                     allele1=rna_phased$REF,
                     allele2=rna_phased$ALT)
  names(mySNVs) <- rna_phased$SNV.ID

  # create input RangedSummarizedExperiment object
  # (one-column count matrices; the column/sample name is 'mySample')
  mySample <- SummarizedExperiment(
    assays=list(lociAllele1Counts=matrix(rna_phased$alleleA.counts,
                                         ncol=1,
                                         dimnames=list(names(mySNVs),'mySample')),
                lociAllele2Counts=matrix(rna_phased$alleleB.counts,
                                         ncol=1,
                                         dimnames=list(names(mySNVs),'mySample'))),
    rowRanges=mySNVs
  )

  # run MBASED
  # (phased mode, 10^6 simulations, `threads` parallel workers)
  ASEresults_1s_haplotypesKnown <- runMBASED(ASESummarizedExperiment=mySample,
                                             isPhased=TRUE,
                                             numSim=10^6,
                                             BPPARAM = MulticoreParam(workers = threads))

  # keep the raw MBASED object next to the summarised results
  saveRDS(ASEresults_1s_haplotypesKnown, file=paste0(out, "/ASEresults_1s_haplotypesKnown.rds"))
  # extract results
  results <- summarizeASEResults_1s(ASEresults_1s_haplotypesKnown)

  # adjust the pvalue with BH correction
  # (every adjusted p-value that is not below 0.05, including exactly 0.05,
  # is labelled "padj > 0.05")
  results$geneOutput$padj <- p.adjust(p = results$geneOutput$pValueASE, method = "BH")
  results$geneOutput$significance <- as.factor(ifelse(results$geneOutput$padj < 0.05, "padj < 0.05", "padj > 0.05"))
  results$geneOutput$gene <- rownames(results$geneOutput)

  # genes whose single unphased variant was given the placeholder "1|0"
  # genotype have no real phase, so their allele1IsMajor is blanked
  results$geneOutput$allele1IsMajor[results$geneOutput$gene %in% singleUnphased$gene] = NA

  # add the locus
  # (gene biotype taken from the first rna_filt row of each gene)
  results$geneOutput$geneBiotype <- rna_filt$gene_biotype[match(results$geneOutput$gene, rna_filt$gene)]

### WITHOUT PHASING
} else {

  # make SNV labels
  # ("<gene>:SNV<k>", numbered within each gene in CHROM/POS order)
  rna_filt <- rna_filt %>%
    arrange(CHROM, POS) %>%
    group_by(gene) %>%
    mutate(label = paste0("SNV",1:n()))
  rna_filt$SNV.ID <- paste0(rna_filt$gene, ":", rna_filt$label)

  ### 
  ### MBASED
  ###

  print("Beginning MBASED ...")

  # make the GRanges object of the loci
  mySNVs <- GRanges(seqnames=rna_filt$CHROM,
                    ranges=IRanges(start=rna_filt$POS, width=1),
                    aseID=rna_filt$gene,
                    allele1=rna_filt$REF,
                    allele2=rna_filt$ALT)
  names(mySNVs) <- rna_filt$SNV.ID

  ## create input RangedSummarizedExperiment object
  ## (allele 1 = REF counts, allele 2 = ALT counts; sample name 'mySample')
  mySample <- SummarizedExperiment(
    assays=list(lociAllele1Counts=matrix(rna_filt$REF.COUNTS,
                                         ncol=1,
                                         dimnames=list(names(mySNVs),'mySample')),
                lociAllele2Counts=matrix(rna_filt$ALT.COUNTS,
                                         ncol=1,
                                         dimnames=list(names(mySNVs),'mySample'))),
    rowRanges=mySNVs
  )

  # run MBASED
  # (unphased mode, 10^6 simulations, `threads` parallel workers)
  ASEresults_1s_haplotypesUnknown <- runMBASED(ASESummarizedExperiment=mySample,
                                               isPhased=FALSE,
                                               numSim=10^6,
                                               BPPARAM = MulticoreParam(workers = threads))
  # keep the raw MBASED object next to the summarised results
  saveRDS(ASEresults_1s_haplotypesUnknown, file=paste0(out, "/ASEresults_1s_haplotypesUnknown.rds"))

  # extract results
  results <- summarizeASEResults_1s(ASEresults_1s_haplotypesUnknown)

  # adjust the pvalue with BH correction
  # (every adjusted p-value that is not below 0.05, including exactly 0.05,
  # is labelled "padj > 0.05")
  results$geneOutput$padj <- p.adjust(p = results$geneOutput$pValueASE, method = "BH")
  results$geneOutput$significance <- as.factor(ifelse(results$geneOutput$padj < 0.05, "padj < 0.05", "padj > 0.05"))
  results$geneOutput$gene <- rownames(results$geneOutput)

  # add the locus
  # (gene biotype taken from the first rna_filt row of each gene)
  results$geneOutput$geneBiotype <- rna_filt$gene_biotype[match(results$geneOutput$gene, rna_filt$gene)]

} 

# save the results 
# (the summarised list goes into the output directory `out`)
saveRDS(results, file=paste0(out, "/MBASEDresults.rds"))
print("Finished MBASED")

R dplyr tidyr optparse reshape2 tibble SummarizedExperiment MBASED From line 7 of src/mbased.snpEff.R

Robust Optogenetic Inhibition with Red-light-sensitive Anion-conducting Channelrhodopsins

library(dplyr)
library(tidyr)
library(bio3d)

# Export the secondary-structure elements (sheets and helices) of chain A
# from the template PDB files as a headerless, tab-separated, nine-column
# table written to the Snakemake output file.
template_file <- unlist(snakemake@input)
output_file <- unlist(snakemake@output)

# NOTE(review): the template list is read from a hard-coded path and
# `template_file` is not used anywhere below -- confirm whether it was
# meant to be read here instead.
templates <- read.table("pdb/template.txt", col.names = c("ID", "_P_", "fname")) %>%
    mutate(ID = substring(ID, 2)) # drop the first character of each ID

# One parsed PDB structure per template, named by template ID.
data <- lapply(templates$fname, read.pdb) %>%
    setNames(templates$ID)

# Sheet records of all templates stacked into one table.
sheets <- lapply(data, `[[`, "sheet") %>%
    lapply(data.frame) %>%
    setNames(templates$ID) %>%
    bind_rows(.id = "ID") %>%
    mutate(
        # Scalar `if`, not `ifelse()`: with a length-one test `ifelse()` returns
        # only the first element of `sense`, which mutate() would then recycle
        # to every row. `if` keeps the full per-strand column.
        sense = if ("sense" %in% names(.)) as.numeric(sense) else NA,
        # Negative sense becomes "-", anything else "+" (NA stays NA).
        sense = ifelse(sense < 0, "-", "+")
    )

# Helix records of all templates stacked into one table.
helices <- lapply(data, `[[`, "helix") %>%
    lapply(data.frame) %>%
    setNames(templates$ID) %>%
    bind_rows(.id = "ID")

list(sheet = sheets, helix = helices) %>%
    bind_rows(.id = "ss") %>%           # `ss` records the element type
    filter(chain == "A") %>%
    replace_na(list(sense = ".")) %>%   # rows without a sense value get "."
    mutate(source = "SS", score = ".", frame = ".", attrib = ".") %>%
    select(ID, source, ss, start, end, score, sense, frame, attrib) %>%
    write.table(output_file, quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE)

R dplyr tidyr Bio3D From line 1 of scripts/features.R

library(dplyr)
library(tidyr)
library(treeio)
library(ggtree)
library(readxl)
library(photobiology)
library(ggplot2)
library(ggnewscale)
library(castor)
library(ape)

# Paths supplied by Snakemake. `[[` matches the entry name exactly and the
# values are assigned directly in the script's environment (no `<<-`), so a
# missing entry cannot silently resolve to a same-named global object.
tree_file <- snakemake@input[["tree"]]
metadata_file <- snakemake@input[["metadata"]]

output_file <- unlist(snakemake@output)

# Re-tag a tree table with the tbl_tree class vector and hand it to
# as.treedata().
#   tree: tree table (one row per node)
# Returns whatever as.treedata() produces for the re-classed table.
to_treedata <- function(tree) {
    tree_tbl <- `class<-`(tree, c("tbl_tree", "tbl_df", "tbl", "data.frame"))
    as.treedata(tree_tbl)
}

# Add a max-parsimony state reconstruction of one categorical column.
#   tree:    tree table with `node` and `label` columns
#   colname: unquoted name of the column holding the tip categories
# Returns `tree` left-joined with two new columns, `<colname>_hsp` (state)
# and `<colname>_hsp_lh` (its likelihood), filled only for node/state pairs
# whose likelihood exceeds 0.99.
add_hsp <- function(tree, colname) {
    colname <- deparse(substitute(colname))
    treedata <- to_treedata(tree)
    # Categories looked up by label, in the phylo object's tip order.
    tip_labels <- treedata@phylo$tip.label
    categories <- as.factor(setNames(tree[[colname]], tree[["label"]])[tip_labels])
    reconstruction <- hsp_max_parsimony(treedata@phylo, as.numeric(categories), edge_exponent = 0.1)
    # Likelihood columns are renamed to the factor levels; the row index is
    # used as the node number.
    hsp <- as.data.frame(reconstruction$likelihoods) %>%
        setNames(levels(categories)) %>%
        mutate(node = 1:n()) %>%
        gather(value, likelihood, -node) %>%
        filter(likelihood > 0.99) %>%
        setNames(c("node", paste0(colname, "_hsp"), paste0(colname, "_hsp_lh")))
    left_join(tree, hsp, by = "node")
}
# getMRCA() wrapper that yields NA instead of NULL.
#   phylo: tree passed through to getMRCA()
#   tips:  tips passed through to getMRCA()
# Returns the getMRCA() result, or NA when that result is NULL.
get_mrca <- function(phylo, tips) {
    ancestor <- getMRCA(phylo, tips)
    if (is.null(ancestor)) NA else ancestor
}
# Label the MRCA node of every group defined by one column.
#   tree:    tree table with `node` and `label` columns
#   colname: unquoted name of the grouping column
# Returns `tree` with an extra column `<colname>_mrca` that carries the group
# value on the node that is the MRCA of the group's tips (only when more than
# one tip maps to that MRCA) and NA everywhere else.
add_mrca <- function(tree, colname) {
    colname <- deparse(substitute(colname))
    treedata <- to_treedata(tree)
    annotated <- mutate(tree, my_column = !!as.name(colname)) %>%
        group_by(my_column) %>%
        mutate(
            is.tip = label %in% treedata@phylo$tip.label,
            no_data = all(is.na(my_column)),
            # One MRCA per group, computed from the group's tips only.
            mrca = get_mrca(treedata@phylo, node[is.tip]),
            # Groups without data or without an MRCA fall back to the node itself.
            mrca = ifelse(no_data | is.na(mrca), node, mrca)
        ) %>%
        group_by(mrca) %>%
        mutate(
            enough_tips = sum(is.tip) > 1,
            mrca_label = ifelse(node == mrca & enough_tips, first(na.omit(my_column)), NA)
        )
    tree[[paste0(colname, "_mrca")]] <- pull(annotated, mrca_label)
    return(tree)
}

# Metadata sheet: sequence names are cut at the first comma and "@" becomes
# "_"; `Maximum..nm` takes the action maximum and falls back to the
# absorption maximum when the former is missing.
metadata <- read_xlsx(metadata_file, .name_repair = "universal") %>%
    mutate(
        Sequence.name = sub(",.+", "", Sequence.name),
        Sequence.name = gsub("@", "_", Sequence.name),
        Maximum..nm = ifelse(is.na(Action.maximum..nm), Absorption.maximum..nm, Action.maximum..nm)
    )

# Tree table joined with the metadata by tip label, plus derived display
# columns, MRCA labels per ChR.group and reconstructed Category states.
tree <- read.iqtree(tree_file) %>%
    as_tibble %>%
    left_join(metadata, by = c(label = "Sequence.name")) %>%
    mutate(
        Color = unname(w_length2rgb(Maximum..nm)),
        # "no photocurrents" and "channel" are treated as uncategorised;
        # everything else is the Currents text without square brackets.
        Category = case_when(
            Currents %in% c("no photocurrents", "channel") ~ NA_character_,
            TRUE ~ gsub("[][]", "", Currents)
        ),
        Symbol = gsub(",.+", "", Symbol),
        # Only symbols with this specific reference are shown as tip labels.
        Symbol_show = ifelse(Currents.reference == "[Oppermann23](in_prep)", Symbol, NA)
    ) %>%
    add_mrca(ChR.group) %>%
    add_hsp(Category)

# Branch colours per reconstructed category (values of Category_hsp).
# NOTE(review): where the tree table is built, Currents == "channel" is
# mapped to an NA Category, so the "channel" entry looks unused -- confirm.
cat_colors <- list(
    "anion channel" = "indianred",
    "cation channel" = "deepskyblue",
    "potassium channel" = "purple",
    "channel" = "yellow4"
)
# Tree in "ape" layout, branches coloured by reconstructed category. Each
# new_scale_color() closes the colour scale declared before it so the next
# layers can use an independent one; the order of layers and scales matters.
p <- ggtree(to_treedata(tree), aes(color = Category_hsp), layout = "ape") +
    scale_color_manual(values = cat_colors) + new_scale_color() +
    # Support markers on branches: dark for UFboot >= 95, light for 90 <= UFboot < 95.
    geom_nodepoint(aes(x = branch.x, y = branch.y, subset = !is.na(UFboot) & UFboot >= 95), size = 0.2, color = "#4d4d4dff") +
    geom_nodepoint(aes(x = branch.x, y = branch.y, subset = !is.na(UFboot) & UFboot >= 90 & UFboot < 95), size = 0.2, color = "#b3b3b3ff") +
    # Tips with a category but no colour value are drawn in gray ...
    geom_tippoint(aes(subset = !is.na(Category) & is.na(Color)), color = "darkgray") +
    # ... tips with a colour value use it verbatim (identity scale).
    geom_tippoint(aes(subset = !is.na(Color), color = Color)) + scale_colour_identity() + new_scale_color() +
    geom_tiplab2(aes(label = Symbol_show), hjust = -0.2) +
    geom_treescale(width = 0.5) +
    # Clade labels on the nodes that carry a ChR.group_mrca value.
    geom_cladelab(mapping = aes(subset = !is.na(ChR.group_mrca), node = node, label = ChR.group_mrca), offset = -0.1) +
    xlim(-10, 10)
ggsave(output_file, p, width = 7, height = 7)

R ggplot2 dplyr tidyr readxl APE ggtree treeio ggnewscale castor photobiology From line 1 of scripts/plot_tree.R

Carotenoid Antenna Workflow for Energy Transfer in Rhodopsin Pumps

library(treeio)
library(ggtree)
library(ape)
library(phytools)
library(dplyr)
library(tidyr)
library(ggplot2)
library(phangorn)
library(ggnewscale)
library(castor)
library(seqinr)

# Stand-in for the `snakemake` object so the script can be stepped through in
# an interactive session.
# NOTE(review): this stub only defines the inputs `outgroup`, `tree` and
# `metadata` and a single unnamed output, while the bindings below read
# `tsv`, `colors`, `a2m`, `small`, `big` and `jtree` -- the stub looks stale;
# confirm the intended paths before relying on it.
if (interactive()) {
    Snakemake <- setClass("Snakemake", slots = list(input = "list", output = "list"))
    snakemake <- Snakemake(
        input = list(outgroup = "input/outgroups.fasta", tree = "analysis/phylogeny/rhodopsins.treefile", metadata = "analysis/parse/phylogeny.tsv"),
        output = list("tmp.pdf")
    )
}

# Bind the Snakemake inputs/outputs to script-level variables. `[[` matches
# names exactly and yields NULL for an absent entry, whereas the previous
# `with(...) <<-` form fell through to any same-named global (a missing
# `colors` silently became the grDevices::colors function).
outgroup_file <- snakemake@input[["outgroup"]]
tree_file     <- snakemake@input[["tree"]]
tsv_file      <- snakemake@input[["tsv"]]
# NOTE(review): colors_file is not referenced in the part of the script
# visible here -- confirm it is still needed.
colors_file   <- snakemake@input[["colors"]]
a2m_file      <- snakemake@input[["a2m"]]

output_file_small <- snakemake@output[["small"]]
output_file_big   <- snakemake@output[["big"]]
output_jtree      <- snakemake@output[["jtree"]]

# Named vector built from taxa.txt: column 2 values named by column 1.
# NOTE(review): arrange(1) orders by the constant 1 and therefore leaves the
# rows as read; `arrange(V1)` may have been intended -- confirm.
taxa <- with(
    arrange(read.table("metadata/taxa.txt", sep = "\t", comment.char = ""), 1),
    setNames(V2, V1)
)

# Sequence names found in the outgroup FASTA.
outgroups <- names(read.fasta(outgroup_file))

# Per-record annotation table without its `target` column.
tsv <- read.table(tsv_file, header = TRUE, sep = "\t", na.strings = "", fill = TRUE) %>%
    select(-target)

# One row per aligned sequence, joined with its annotation and extended with
# display columns.
metadata <- read.fasta(a2m_file, seqtype = "AA", as.string = TRUE) %>%
    {data.frame(label = names(.), sequence = as.character(.))} %>%
    left_join(tsv, by = c(label = "record_id")) %>%
    mutate(
        is_outgroup = label %in% outgroups,
        # Alias is cut at the first comma.
        Alias = gsub(",.+", "", Alias),
        # Activity values ending in "]" are blanked; brackets are stripped
        # from the rest.
        Activity = ifelse(grepl("\\]$", Activity), NA, gsub("[][]", "", Activity)),
        Highlight = !is.na(Highlight),
        # The three motif characters become separate columns; G156 copies `window`.
        D85 = substr(motif, 1, 1),
        T89 = substr(motif, 2, 2),
        D96 = substr(motif, 3, 3),
        G156 = window
    )

# Read the tree as produced upstream (not yet re-rooted).
tree <- read.tree(tree_file)

# Unrooted tree annotated with the metadata, exported as a jtree file.
tree.unrooted <- as_tibble(tree) %>%
    left_join(metadata, by = "label") %>%
    `class<-`(c("tbl_tree", "data.frame")) %>%
    as.treedata
write.jtree(tree.unrooted, file = output_jtree)

# Rooted working copy: root on the outgroup sequences, then drop them.
tree.tib <- ape::root(tree, outgroups, edgelabel = T, resolve.root = T) %>%
    drop.tip(outgroups) %>%
    as_tibble %>%
    # Labels that parse as numbers become support values; all other labels
    # yield NA (the coercion warning is silenced on purpose).
    mutate(support = suppressWarnings(as.numeric(label))) %>%
    left_join(metadata, by = "label") %>%
    # Overwrites the joined is_outgroup column: TRUE for outgroup labels, NA otherwise.
    mutate(is_outgroup = ifelse(label %in% outgroups, T, NA)) %>%
    `class<-`(c("tbl_tree", "data.frame"))
# Plain phylo object of the rooted tree, used for the state reconstruction.
tree.phylo <- as.treedata(tree.tib)@phylo

# Colour name for each amino-acid one-letter code, grouped by colour
# (the variable name suggests the ClustalX scheme). The element order is:
# A I L M F W V C K R E D N Q S T G P H Y.
clustalx <- c(
    setNames(rep("BLUE", 8), c("A", "I", "L", "M", "F", "W", "V", "C")),
    setNames(rep("RED", 2), c("K", "R")),
    setNames(rep("MAGENTA", 2), c("E", "D")),
    setNames(rep("GREEN", 4), c("N", "Q", "S", "T")),
    c(G = "ORANGE", P = "YELLOW"),
    setNames(rep("CYAN", 2), c("H", "Y"))
)

# Clade value of every tip (rows whose node is nobody's parent), as a factor.
clades <- filter(tree.tib, ! node %in% parent) %>%
    pull(Clade) %>%
    as.factor
# Max-parsimony reconstruction of the clade state on the rooted tree.
hsp <- hsp_max_parsimony(tree.phylo, as.numeric(clades)) %>%
    `$`("likelihoods") %>%
    # check.names = F keeps the column names convertible to numbers, which
    # the as.numeric(hsp) lookup below depends on.
    data.frame(check.names = F) %>%
    # Row position is used as the node number -- assumes the likelihood rows
    # follow the tree's node numbering; TODO confirm.
    mutate(node = row_number()) %>%
    gather(hsp, prob, -node) %>%
    filter(prob > 0.9) %>%
    # Translate the state index back into the clade name.
    mutate(hsp = levels(clades)[as.numeric(hsp)])

# Attach the reconstructed state to the tree table and blank it on every node
# whose parent carries the same state, so only the topmost node of each run
# keeps its value.
tree <- left_join(tree.tib, hsp, by = "node") %>%
    mutate(hsp.parent = .[match(parent, node),"hsp"]) %>%
    mutate(hsp = ifelse(!is.na(hsp.parent) & hsp == hsp.parent, NA, hsp)) %>%
    as.treedata

# Compact circular tree: clade highlights, alias labels (two sizes depending
# on Highlight), support dots and one residue column. Each new_scale() call
# closes the preceding scale of that aesthetic, so the order of the layers
# and scales below matters.
p_small <- ggtree(tree, layout = "circular") +
    geom_highlight(mapping = aes(subset = !is.na(hsp), fill = hsp), alpha = 0.1) +
    geom_treescale() +
    geom_tiplab(mapping = aes(subset = !is.na(Alias), label = Alias, size = Highlight), offset = 0.1) +
    # geom_tippoint(mapping = aes(subset = !is.na(Activity), color = Activity), size = 1) +
    scale_size_manual(values = c(2.5, 5)) + new_scale("size") + # labels
    # Dots for support >= 90; the larger size marks support >= 95.
    geom_point2(aes(subset = !is.na(support) & support >= 90, size = support >= 95), color = "darkgray") +
    scale_size_manual(values = c(0.5, 1)) + # support values
    new_scale("color") +
    # Residue letter drawn at a fixed x position, coloured via `clustalx`.
    geom_text2(aes(label = G156, color = G156, angle = angle - 90, x = 4.3), size = 3) +
    scale_color_manual(values = clustalx) # residues

# Detailed rectangular tree: branches coloured by Taxon, tips labelled
# "<alias or label> [<organism>]", activity points, support squares and four
# residue columns at fixed x positions.
p_big <- ggtree(tree, aes(color = Taxon), layout = "rectangular") +
    geom_highlight(mapping = aes(subset = !is.na(hsp), fill = hsp), alpha = 0.1) +
    scale_color_manual(values = taxa) + new_scale("color") +
    geom_treescale() +
    geom_tiplab(mapping = aes(label = sprintf("%s [%s]", ifelse(is.na(Alias), label, Alias), Organism)), size = 2, offset = 0.1) +
    geom_tippoint(mapping = aes(subset = !is.na(Activity), color = Activity), size = 1) +
    # Support squares placed at `branch` (x aesthetic) rather than on the node.
    geom_point2(aes(subset = !is.na(support) & support >= 90, size = support >= 95, x = branch), shape = 15, color = "darkgray") +
    scale_size_manual(values = c(0.5, 1)) + # support values
    new_scale("color") +
    geom_text2(aes(label = D85, color = D85, x = 4.8), size = 1.5) +
    geom_text2(aes(label = T89, color = T89, x = 5.0), size = 1.5) +
    geom_text2(aes(label = D96, color = D96, x = 5.2), size = 1.5) +
    geom_text2(aes(label = G156, color = G156, x = 5.4), size = 2) +
    scale_color_manual(values = clustalx) # residues

# Write both figures to the paths bound from the Snakemake outputs.
ggsave(output_file_small, p_small, height = 7, width = 8)
ggsave(output_file_big,   p_big,   height = 6.5, width = 5)

R Snakemake ggplot2 dplyr tidyr APE seqinr ggtree treeio ggnewscale phangorn phytools castor From line 2 of scripts/plot_rhodopsins.R

Gezelvirus Workflow: Studying a Polinton-like Virus in Phaeocystis globosa

suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(tidyr))
suppressPackageStartupMessages(library(stringr))

# Read the whole report from standard input.
stdin_fd <- file("stdin")
all.lines <- readLines(stdin_fd)
close(stdin_fd)

# Anchor lines: the line after the first "No Hit" header, and the first
# "No 1" line. Both are NA when the pattern does not occur.
data.start  <- grep("^ *No Hit", all.lines)[1] + 1
align.start <- grep("^No 1", all.lines)[1]

# Section boundaries derived from the anchors: parameters up to two lines
# before data.start, and the alignment section running to the end of input.
param.start <- 1
param.end   <- data.start - 2
data.end    <- align.start - 1
align.end   <- length(all.lines)

# Turn the alignment section of the report into one row per hit and print it
# as a tab-separated table on standard output. When the report contains no
# "No 1" block, an empty table with the same columns is printed instead.
if (is.na(align.start)) {
    data <- tibble(
        Query           = character(),
        No              =   integer(),
        Hit.ID          = character(),
        Hit.Description = character(),
        Q.ss_pred       = character(),
        Q.query         = character(),
        Q.consensus     = character(),
        Q.Start         =   integer(),
        Q.End           =   integer(),
        Q.Length        =   integer(),
        T.consensus     = character(),
        T.Start         =   integer(),
        T.End           =   integer(),
        T.Length        =   integer(),
        T.hit           = character(),
        T.ss_dssp       = character(),
        T.ss_pred       = character(),
        Aligned_cols    =   integer(),
        E.value         =   numeric(),
        Identities      =   numeric(),
        Probab          =   numeric(),
        Score           =   numeric(),
        Similarity      =   numeric(),
        Sum_probs       =   numeric(),
        Template_Neff   =   numeric()
    )
} else {
    # Header block as a named list: the key is taken from the first 14
    # characters of each line, the value from character 14 onwards.
    metadata <- data.frame(key = all.lines[param.start:param.end]) %>%
        mutate(value = substr(key, 14, 10000) %>% trimws, key = substr(key, 1, 14) %>% trimws) %>%
        filter(key != "") %>%
        {setNames(.$value, .$key)} %>%
        as.list
    data <- data.frame(Query = sub(" .*", "", metadata$Query), line = all.lines[align.start:align.end], stringsAsFactors = FALSE) %>%
        filter(line != "") %>%
        # First token of the line is `name`, the remainder `value`.
        extract(line, into = c("name", "value"), regex = "([^ ]+) ?(.+)?", remove = FALSE) %>%
        mutate(No = ifelse(name == "No", value, NA) %>% as.integer) %>%
        # Hit ID = the ">" line's first token without the ">". The stop
        # position must come from `name` itself: inside mutate() a bare `.`
        # is the whole piped data frame, not the column.
        mutate(Hit.ID = ifelse(substr(name, 1, 1) == ">", substr(name, 2, nchar(name)), NA)) %>%
        mutate(Hit.Description = ifelse(substr(name, 1, 1) == ">", value, NA)) %>%
        # The statistics line is the one whose first token contains "=".
        mutate(Match = ifelse(grepl("=", name), line, NA)) %>%
        # The line sandwiched between "Q Consensus" and "T Consensus" is the
        # match line; tag it "M" and keep it whole.
        mutate(name = ifelse(grepl("Q Consensus", lag(line)) & grepl("T Consensus", lead(line)), "M", name)) %>%
        mutate(value = ifelse(name == "M", line, value)) %>%
        # Propagate the hit number down to every line of the hit.
        fill(No) %>%
        group_by(Query, No) %>%
        # Concatenate the per-block pieces of each track; the sequence part
        # starts at character 16 of `value`.
        summarize(
            Hit.ID       = na.omit(Hit.ID) %>% first,
            Hit.Description = na.omit(Hit.Description) %>% first,
            Match        = na.omit(Match) %>% first,
            Q.ss_pred    = value[name == "Q" & grepl("^ss_pred ", value)]         %>% substr(., 16, nchar(.)) %>% paste(collapse = "") %>% gsub(" +", "", .),
            Q.query      = value[name == "Q" & grepl("^Consensus ", lead(value))] %>% substr(., 16, nchar(.)) %>% paste(collapse = " "),
            Q.consensus  = value[name == "Q" & grepl("^Consensus ", value)]       %>% substr(., 16, nchar(.)) %>% paste(collapse = " "),
            T.consensus  = value[name == "T" & grepl("^Consensus ", value)]       %>% substr(., 16, nchar(.)) %>% paste(collapse = " "),
            T.hit        = value[name == "T" & grepl("^Consensus ", lag(value))]  %>% substr(., 16, nchar(.)) %>% paste(collapse = " "),
            T.ss_dssp    = value[name == "T" & grepl("^ss_dssp ", value)]         %>% substr(., 16, nchar(.)) %>% paste(collapse = " ") %>% gsub(" +", "", .),
            T.ss_pred    = value[name == "T" & grepl("^ss_pred ", value)]         %>% substr(., 16, nchar(.)) %>% paste(collapse = "")  %>% gsub(" ", "", .),
            .groups = "drop"
        ) %>%
        # Start = first number, End = last number before the parenthesised
        # length, Length = the number in parentheses.
        extract(Q.consensus, into = c("Q.Start", "Q.End", "Q.Length"), regex = "^ *(\\d+) .+ (\\d+) +[(](\\d+)[)]$", remove = FALSE, convert = TRUE) %>%
        extract(T.consensus, into = c("T.Start", "T.End", "T.Length"), regex = "^ *(\\d+) .+ (\\d+) +[(](\\d+)[)]$", remove = FALSE, convert = TRUE) %>%
        # Strip coordinates, parentheses and spaces, leaving only the sequences.
        mutate(
            Q.consensus  = gsub("[0-9() ]+", "", Q.consensus),
            Q.query      = gsub("[0-9() ]+", "", Q.query),
            T.consensus  = gsub("[0-9() ]+", "", T.consensus),
            T.hit        = gsub("[0-9() ]+", "", T.hit),
        ) %>%
        #extract(Hit.Description, into = "Hit.Organism",    regex = "[{]([^}]+)[}]",  remove = F) %>%
        #extract(Hit.Description, into = "Hit.Description", regex = "([^;]+)",        remove = F) %>%
        #extract(Hit.Description, into = "Hit.Keywords",    regex = "[^;]+; ([^;]+)", remove = F) %>%
        # Split the "key=value" statistics into one numeric column per key.
        mutate(Match = str_split(Match, " +")) %>%
        unnest(cols = Match) %>%
        separate(Match, into = c("key", "value"), "=") %>%
        mutate(value = sub("%", "", value) %>% as.numeric) %>%
        spread(key, value) %>%
        rename(E.value = `E-value`) %>%
        mutate(Aligned_cols = as.integer(Aligned_cols))
}

write.table(data, quote = FALSE, sep = "\t", row.names = FALSE)

R dplyr tidyr stringr From line 4 of helpers/parse_hhsuite.R

library(dplyr)
library(tidyr)

# Bind the Snakemake inputs and parameters to script-level variables with
# exact-name extraction and plain assignment (no `<<-`, no fall-through to
# same-named globals when an entry is missing).
clu_tsv_file      <- snakemake@input[["clu_tsv"]]
segment_tsv_files <- snakemake@input[["segment_tsv"]]
virus_tsv_files   <- snakemake@input[["virus_tsv"]]

# Thresholds for full-length matches ...
coverage <- snakemake@params[["coverage"]]
probab   <- snakemake@params[["probab"]]
# ... and for the alternative fragment criterion.
coverage_q_frag <- snakemake@params[["coverage_q_frag"]]
coverage_t_frag <- snakemake@params[["coverage_t_frag"]]
identities_frag <- snakemake@params[["identities_frag"]]
probab_frag     <- snakemake@params[["probab_frag"]]

output_file <- unlist(snakemake@output)

# Cluster membership table: cluster name -> member ID.
clusters <- read.table(clu_tsv_file, sep = "\t", col.names = c("Cluster", "ID"))

# Hit tables for segments and viruses, one data frame per input file.
segment_tsv <- lapply(segment_tsv_files, read.table, header = TRUE, sep = "\t", quote = "")
virus_tsv   <- lapply(virus_tsv_files, read.table, header = TRUE, sep = "\t", quote = "")

# All hits with query/target coverage in percent, filtered and expanded to
# one row per cluster member.
data <- bind_rows(c(segment_tsv, virus_tsv)) %>%
    mutate(
        Q.Coverage = (Q.End - Q.Start + 1) / Q.Length * 100,
        T.Coverage = (T.End - T.Start + 1) / T.Length * 100
    ) %>%
    # Earlier form of the filter:
    # filter(Probab >= probab, Q.Coverage >= coverage, T.Coverage >= coverage) %>%
    # A hit passes when both coverages reach `coverage`, or when it meets all
    # four fragment thresholds.
    # NOTE(review): `Probab >= probab` applies to both alternatives, so
    # `probab_frag` only has an effect when it is larger than `probab` --
    # confirm this is intended.
    filter(
        Probab >= probab,
        (Q.Coverage >= coverage & T.Coverage >= coverage) |
            (Q.Coverage >= coverage_q_frag & T.Coverage >= coverage_t_frag &
                Identities >= identities_frag & Probab >= probab_frag)
    ) %>%
    left_join(clusters, by = c(Hit.ID = "Cluster")) %>%
    select(Query, ID, Probab)

# Headerless three-column table: Query, ID, Probab.
write.table(data, output_file, sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE)

R dplyr tidyr From line 2 of scripts/abc_graph.R

library(dplyr)
library(tidyr)
library(ape)
library(ggtree)
library(treeio)
library(phangorn)
library(stringr)
library(ggplot2)
library(phytools)

# Stand-in for the `snakemake` object so the script can be run in an
# interactive session. The class carries a `params` slot because
# `snakemake@params` is read below; without it the slot access fails.
# NOTE(review): outgroup_rooting = FALSE is an assumed default for
# interactive use -- adjust as needed.
if (interactive()) {
    setClass("snake", slots = list(input = "list", output = "list", params = "list"))
    snakemake <- new("snake", input  = list(
            tree = "analysis/phylogeny/MCP_NCLDV_epa/epa_result.newick",
            fasta = "analysis/phylogeny/MCP_NCLDV.fasta",
            outgroups    = "metadata/queries/MCP_NCLDV_outgroups.faa",
            synonyms = "metadata/organisms.txt",
            hmm = Sys.glob("hmm_algae/*.hmm")
    ), output = list(
        image = "test.svg",
        jtree = "output/MCP_NCLDV.jtree"
    ), params = list(
        outgroup_rooting = FALSE
    ))
}

# Bind inputs, outputs and parameters with exact-name extraction and plain
# assignment (no `<<-`, no fall-through to same-named globals).
tree_file     <- snakemake@input[["tree"]]
fasta_file    <- snakemake@input[["fasta"]]
synonyms_file <- snakemake@input[["synonyms"]]
outgroup_file <- snakemake@input[["outgroups"]]

out_image_file <- snakemake@output[["image"]]
out_jtree_file <- snakemake@output[["jtree"]]

# Whether to re-root on the outgroup sequences after midpoint rooting.
outgroup_rooting <- snakemake@params[["outgroup_rooting"]]

# Collect the sequence titles of all non-empty FASTA files.
#   fnames: character vector of file paths
# Returns a data frame with a single column `title`, one row per sequence;
# files of size zero (or unknown size) are skipped.
read.fasta.headers <- function(fnames) {
    sizes <- file.info(fnames)$size
    non_empty <- unname(fnames[!is.na(sizes) & sizes > 0])
    titles <- lapply(non_empty, function(fname) names(treeio::read.fasta(fname)))
    data.frame(title = unlist(titles))
}

# Organism table; `Collapse` falls back to `Name` where it is missing.
synonyms <- read.table(synonyms_file, header = TRUE, sep = "\t", fill = TRUE, na.strings = "") %>%
    mutate(Collapse = ifelse(is.na(Collapse), Name, Collapse))

# FASTA titles split into tip label (first token) and ID (second token),
# joined with the organism table by ID.
headers <- read.fasta.headers(fasta_file) %>%
    extract(title, into = c("label", "ID"), regex = "^([^ ]+) ([^ ]+)", remove = FALSE) %>%
    left_join(synonyms, by = "ID")

# Abort with exit status 1 when any sequence has no entry in the organism table.
no_name <- paste(with(headers, label[is.na(Name)]), collapse = ", ")
if (nzchar(no_name)) {
    print(paste("No aliases found for: ", no_name))
    quit(status = 1)
}

# Read the tree and midpoint-root it; node labels are treated as support values.
tree <- read.tree(tree_file)
tree <- phangorn::midpoint(tree, node.labels = "support")
# Optionally re-root on the MRCA of the outgroup sequences.
if (outgroup_rooting) {
    outgroup_df <- read.fasta.headers(outgroup_file)
    # Outgroup tip labels = FASTA titles up to the first space.
    outgroups <- with(outgroup_df, sub(" .*", "", title))
    tree <- ape::root(tree, node = MRCA(tree, outgroups), edgelabel = T, resolve.root = T)
}
# Tree table with support values and organism annotation.
tree <- as_tibble(tree) %>%
    # Only internal nodes (those that are someone's parent) with a non-empty
    # label carry a support string.
    mutate(support = ifelse(node %in% parent & label != "", label, NA)) %>%
    # The support string is split at "/" into SH_aLRT and UFboot.
    separate(support, into = c("SH_aLRT", "UFboot"), sep = "/", convert = T) %>%
    left_join(headers, by = "label") %>%
    mutate(label.show = Name) %>%
    mutate(isInternal = node %in% parent) %>%
    `class<-`(c("tbl_tree", "tbl_df", "tbl", "data.frame"))
tree_data <- as.treedata(tree)
write.jtree(tree_data, file = out_jtree_file)

# Number of tips (nodes that are nobody's parent); used to scale the figure height.
ntaxa <- filter(tree, ! node %in% parent) %>% nrow

# Colour per host group, as a named list (one colour name per entry).
colors <- as.list(c(
    Haptophyta       = "orange",
    Chlorophyta      = "green",
    Streptophyta     = "darkgreen",
    MAG              = "purple",
    Stramenopiles    = "brown",
    Cryptophyta      = "red",
    Amoebozoa        = "gold4",
    Euglenozoa       = "yellow",
    Choanoflagellata = "darkslateblue",
    Glaucophyta      = "cyan",
    Animals          = "blue",
    Dinoflagellata   = "gray50",
    Rhizaria         = "gray30"
))

# Shrink every clade listed in `df` on the plot.
#   p:  ggtree plot
#   df: table with a `node` column naming the clades' root nodes
# Each clade is scaled by 0.5 / (number of offspring - 1); the clades are
# applied one after another to the accumulating plot, which is returned.
scaleClades <- function(p, df) {
    shrink_clade <- function(plot, clade) {
        n_offspring <- nrow(offspring(plot$data, clade))
        scaleClade(plot, clade, 0.5 / (n_offspring - 1))
    }
    with(df, Reduce(shrink_clade, node, p))
}
# Collapse every clade listed in `df` on the plot.
#   p:  ggtree plot
#   df: table with columns `node`, `Host` and `label.show`
# For each clade the plot data's label.show at the clade node is replaced by
# the clade's label.show, and the clade is collapsed in "mixed" mode with the
# fill colour looked up in `colors` by the clade's Host. Returns the plot
# after all clades have been processed.
collapseClades <- function(p, df) {
    with(df, {
        collapse_one <- function(plot, clade) {
            is_clade <- node == clade
            clade_fill <- unlist(colors[Host[is_clade]])
            plot$data[plot$data$node == clade, "label.show"] <- label.show[is_clade]
            collapse(plot, clade, "mixed", fill = clade_fill)
        }
        Reduce(collapse_one, node, p)
    })
}
#labelClades <- function(p) {
#    with(df, Reduce(function(.p, .node) {
#        .p + geom_cladelab(node = .node, label = label[node == .node], align = T, offset = .2, textcolor = 'blue')
#    }, node, p))
#}

# Clades to collapse: ancestors whose descendants all share one `Collapse` value.
# NOTE(review): presumably allDescendants() returns, per node, the set of its
# descendant nodes, so that the list position used as `ancestor` below is the
# node number -- confirm.
multi_species <- allDescendants(tree_data@phylo) %>%
    lapply(function(x) filter(tree, node %in% x)) %>%
    bind_rows(.id = "ancestor") %>%
    group_by(ancestor) %>%
    # Keep ancestors with exactly one distinct non-NA Collapse value among
    # their descendants and more than one tip.
    filter(n_distinct(Collapse, na.rm = T) == 1, sum(!isInternal) > 1) %>% # , !any(Group == "Haptophyta")) %>%
    ungroup %>%
    mutate(ancestor = as.numeric(ancestor)) %>%
    # Drop ancestors that themselves appear among the retained descendant
    # nodes, i.e. clades nested inside a larger retained clade.
    filter(! ancestor %in% node) %>%
    filter(!is.na(Collapse)) %>%
    group_by(ancestor, Collapse) %>%
    summarize(num_tips = sum(!isInternal), Host = first(na.omit(Host))) %>%
    # Label shown for the collapsed clade: "<Collapse> (<number of tips>)".
    mutate(label.show = sprintf("%s (%d)", Collapse, num_tips)) %>%
    rename(node = ancestor)
# Base tree: support points for UFboot >= 90 (sized by UFboot), aligned tip
# labels, labels for the clades that get collapsed, host-coloured tip points.
p <- ggtree(tree_data) +
    geom_nodepoint(aes(x = branch, subset = !is.na(UFboot) & UFboot >= 90, size = UFboot)) +
    geom_tiplab(aes(label = label.show), size = 4, align = T, linesize = 0) +
    # Text for collapsed clades is placed at the largest x value in the data.
    geom_text2(aes(subset = node %in% multi_species$node, x = max(x, na.rm = T), label = label.show), nudge_x = 0.01, size = 4, hjust = 0) +
    geom_tippoint(aes(color = Host), size = 3) +
    geom_treescale(width = 0.5) +
    scale_size_continuous(limits = c(90, 100), range = c(1, 3)) +
    scale_shape_manual(values = seq(0,15)) +
    scale_color_manual(values = colors)

# Shrink the selected clades first, then collapse them.
p <- scaleClades(p, multi_species)
p <- collapseClades(p, multi_species)
# p <- facet_plot(p, mapping = aes(x = as.numeric(as.factor(query.name)), shape = DESC), data = genes, geom = geom_point, panel = 'Genes')

# Figure height grows with the number of tips (0.1 per tip).
ggsave(out_image_file, p, height = ntaxa * 0.1, width = 7, limitsize = F)

R Snakemake ggplot2 dplyr tidyr stringr APE ggtree treeio phangorn phytools From line 2 of scripts/ggtree.R

Workflow Steps and Code Snippets