Workflow Steps and Code Snippets

124 tagged steps and code snippets that match the keyword "Consensus"

Bioinformatics pipeline for processing 16S marker gene metagenomic sequence data.

library(dada2)
#packageVersion("dada2")
library(tidyverse)

## Set variables
list_of_filenames <- snakemake@input$listfiles
tax_ref <- snakemake@config$dada2_tax_ref
spe_ref <- snakemake@config$dada2_spe_ref
bacterial <- snakemake@config$bacterial

## Make a vector of sample names
samples <- scan(list_of_filenames, what = "character")

out <- readRDS(snakemake@input$filt_out)


## file names of forward and reverse reads, after quality filtering
filtered_forward_reads <- file.path("output/temp/filtered", paste0(samples, "_r1_filtered.fastq.gz"))
filtered_reverse_reads <- file.path("output/temp/filtered", paste0(samples, "_r2_filtered.fastq.gz"))


####### Step 2: Generate error model of data
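## learnErrors alternates error-rate estimation and sample inference
## until the two converge on a jointly consistent solution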

err_forward <- learnErrors(filtered_forward_reads, multithread = TRUE)
err_reverse <- learnErrors(filtered_reverse_reads, multithread = TRUE)



####### Step 3: Dereplicate sequences

derep_forward <- derepFastq(filtered_forward_reads, verbose = TRUE)
derep_reverse <- derepFastq(filtered_reverse_reads, verbose = TRUE)

names(derep_forward) <- samples
names(derep_reverse) <- samples



####### Step 4: Infer sequence variants
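## dada() applies the learned error model to infer exact amplicon
## sequence variants (ASVs) from the dereplicated reads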

dadaF <- dada(derep_forward, err = err_forward, multithread = TRUE)
dadaR <- dada(derep_reverse, err = err_reverse, multithread = TRUE)



####### Step 5: Merge forward and reverse reads
merged <- mergePairs(dadaF, derep_forward, dadaR, derep_reverse, verbose = TRUE)



####### Step 6: Generate count table
seqtab <- makeSequenceTable(merged)




####### Step 7: Remove chimeras
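## with method = "consensus", samples are checked for chimeras
## independently and a consensus decision is made for each sequence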
seqtab.nochim <- removeBimeraDenovo(seqtab, method = "consensus", multithread = TRUE)


## Distribution of reads across the merged sequence lengths
readsbyseqlen <- tapply(colSums(seqtab.nochim), nchar(colnames(seqtab.nochim)), sum)
pdf(file.path("output", "merged_sequence_lengths.pdf"))
plot(as.integer(names(readsbyseqlen)), readsbyseqlen, xlab = "Merged length", ylab = "Total reads")
dev.off()


####### Step 7b: Track reads throughout processing

getN <- function(x) sum(getUniques(x))
summary_tab <- data.frame(
    row.names = samples,
    Input = out[, 1],
    Filtered = out[, 2],
    Denoised = sapply(dadaF, getN),
    Merged = sapply(merged, getN),
    Non.Chimeric = rowSums(seqtab.nochim),
    Total.Perc.Remaining = round(rowSums(seqtab.nochim) / out[, 1] * 100, 1)
)


pdf(file.path("output", "reads_remaining.pdf"))
hist(summary_tab$Total.Perc.Remaining, col = "blue", breaks = 50, main = "Total", xlab = "Percentage reads remaining")
dev.off()

## Write this table to output
write.table(summary_tab, file.path("output", "reads_tracked.txt"))

summary_tab$Sample <- rownames(summary_tab) 
summary_tab <- summary_tab %>% separate(Sample, c("Sample", "temp"), sep = "_S") 
summary_tab$Sample <- factor(summary_tab$Sample, levels = summary_tab$Sample[order(summary_tab$Non.Chimeric)])
summary_tab_long <- summary_tab %>% gather("QC.Step", "Reads", Input:Non.Chimeric)
summary_tab_long$QC.Step <- factor(summary_tab_long$QC.Step, levels = c("Input", "Filtered", "Denoised", "Merged", "Non.Chimeric"))


gg <- ggplot(summary_tab_long, aes(x = Sample, y = Reads, color = QC.Step)) +
	geom_point(size = 2) +
	scale_color_manual(values = rainbow(5, v = 0.8)) +
	theme_bw() +
	theme(axis.text.x = element_text(angle = 90))
pdf(file.path("output", "read_tracking.pdf"))
print(gg)
dev.off()



####### Step 8: Assign Taxonomy
################################################
## To do: add parameter to config file to allow user to decide whether or not to allow multiples
################################################
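## assignTaxonomy implements the RDP naive Bayes classifier, run
## against the reference training set supplied in the config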
taxa <- assignTaxonomy(seqtab.nochim, tax_ref, multithread = TRUE)

## Add Species (if 16S, not if ITS)
if (bacterial == TRUE) {
	taxa <- addSpecies(taxa, spe_ref, allowMultiple = TRUE)
}


####### Step 9: Save output

saveRDS(seqtab.nochim, file.path("output", "seqtab_final.rds"))
saveRDS(taxa, file.path("output", "taxa_final.rds"))

Analysis code for the TAP-seq manuscript.

## packages assumed by this snippet: Biostrings provides DNAStringSet(),
## consensusString() and stringDist(); dplyr and ggplot2 load via tidyverse;
## here() builds the output path
library(Biostrings)
library(tidyverse)
library(here)

# create a consensus gRNA sequence for every detected gRNA (perturbation) in each cell
gRNA_consensus_chr8 <- mm_perts8 %>% 
  group_by(cell_barcode, vector_id) %>% 
  summarize(grna_consensus_seq = consensusString(DNAStringSet(grna_seq)))

gRNA_consensus_chr11 <- mm_perts11 %>% 
  group_by(cell_barcode, vector_id) %>% 
  summarize(grna_consensus_seq = consensusString(DNAStringSet(grna_seq)))

# add expected gRNA sequence
gRNA_consensus_chr8 <- gRNA_consensus_chr8 %>% 
  left_join(select(gRNAs8, -vector_seq), by = "vector_id")

gRNA_consensus_chr11 <- gRNA_consensus_chr11 %>% 
  left_join(select(gRNAs11, -vector_seq), by = "vector_id")
# function to compute edit distance between consensus and expected gRNA sequences
calc_edit_dist <- function(x, y, method = "levenshtein") {
  as.integer(stringDist(c(x, y), method = method))
}

# calculate edit distance between sequenced consensus gRNA and expected gRNA sequence
gRNA_consensus_chr8 <- gRNA_consensus_chr8 %>% 
  rowwise() %>% 
  mutate(edit_dist = calc_edit_dist(grna_consensus_seq, grna_seq, method = "levenshtein"))

gRNA_consensus_chr11 <- gRNA_consensus_chr11 %>% 
  rowwise() %>% 
  mutate(edit_dist = calc_edit_dist(grna_consensus_seq, grna_seq, method = "levenshtein"))

# fraction of gRNA-cell combinations whose consensus exactly matches the expected sequence
noerr_chr8 <- mean(gRNA_consensus_chr8$edit_dist == 0)
noerr_chr11 <- mean(gRNA_consensus_chr11$edit_dist == 0)
# create one table
gRNA_cons <- bind_rows(Chr8 = gRNA_consensus_chr8, Chr11 = gRNA_consensus_chr11, .id = "Sample")

# create histogram of edit distance distribution
ggplot(gRNA_cons, aes(edit_dist, fill = Sample)) +
  geom_histogram(position = "dodge", bins = 20) +
  labs(x = "Edit distance\n(gRNA consensus sequence - template)",
       y = "gRNA - cell combinations") +
  scale_fill_manual(values = c(Chr11 = "indianred3", Chr8 = "steelblue")) +
  theme_bw() +
  theme(panel.grid = element_blank(), aspect.ratio = 1,
        text = element_text(size = 15))

# save plot to .pdf file
ggsave(filename = here("results/plots", "screen_gRNA_errors.pdf"), height = 4, width = 5)

Obtain unbiased SNPs after FASTQ files have been barcoded for UMIs.

__author__ = "Aurora Campo"
__copyright__ = "Copyright 2021, Aurora Campo, modified from git"
__git__ = "https://github.com/johanzi/make_pseudogenome"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell
#from snakemake_wrapper_utils.java import get_java_opts

input = snakemake.input.get("vcf")
extra = snakemake.params.get("extra")
ref = snakemake.input.get("ref")
pseudo = snakemake.output.get("pseudo")
log = snakemake.log_fmt_shell(stdout=False, stderr=True)


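# bcftools consensus applies the variants in the VCF for the selected
# sample to the reference FASTA and writes the sequence to stdout,
# which is redirected into the pseudogenome file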
shell(
    "bcftools consensus "
    "{input} "
    "--sample {extra} "
    "--fasta-ref {ref} "
    "> {pseudo} "
    "{log}"
)

Integrated Mapping and Profiling of Allelically-expressed Loci with Annotations (1.0.0)

shell:
	"""
	cat {input.ref_promoter} | bcftools consensus {input.phase_vcf} -H {wildcards.num}pIu > {output} 2> {log}
	"""

A tool for generating bacterial genomes from metagenomes with nanopore long read sequencing

shell: """
    medaka consensus {input[1]} {output} --model r941_flip213 --threads {threads} --regions {wildcards.range}
    """
shell: """
    bcftools consensus -f {input} -o {output}
    """

A straightforward bioinformatic pipeline for detecting a bacterial strain in one or more metagenome(s).

import sys
import gzip
import re

DEBUG = 0

ASCII_OFFSET = ord("!")  # phred+33 quality encoding
INDEL_PATTERN = re.compile(r"[+-](\d+)")
INDEL_STRING = "[+-]INTEGER[ACGTN]{INTEGER}"
ROUND = 3  # places to right of decimal point

min_coverage, min_proportion, min_qual = snakemake.params
pileup_file = snakemake.input[0]
out_file = snakemake.output[0]

min_coverage = int(min_coverage)
min_proportion = float(min_proportion)
min_qual = int(min_qual)

## function to process each line of pileup
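# each line of a samtools mpileup file has six tab-separated fields:
# chromosome, 1-based position, reference base, read depth,
# read bases (the pileup string) and base qualities (phred+33)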

def parse_pileup(line):

    # read fields from line of pileup
    chromosome, position, reference, coverage, pileup, quality = line.rstrip("\r\n").split("\t")

    # proportion is 0 if no consensus base, top base is N by default
    parsed = {
        "proportion": 0.0,
        "chromosome": chromosome,
        "position": int(position),
        "reference": reference,
        "coverage": int(coverage),
        "pileup": pileup,
        "quality": quality,
        "top_base": "N"}

    # if the base coverage is below the minimum, leave the call as N
    if parsed["coverage"] < min_coverage:
        return parsed

    # uppercase pileup string for processing
    pileup = pileup.upper()

    # Remove read-start markers ("^" plus its mapping-quality character)
    # and read-end markers ("$") from pileup string
    pileup = re.sub(r"\^.|\$", "", pileup)

    # Remove indels from pileup string; an indel is encoded as
    # "+<n><bases>" or "-<n><bases>", e.g. "+2AT" for a 2 bp insertion
    start = 0

    while True:
        match = INDEL_PATTERN.search(pileup, start)

        if match:
            integer = match.group(1)
            pileup = re.sub(INDEL_STRING.replace("INTEGER", integer), "", pileup)
            start = match.start()
        else:
            break

    # get total base count and top base count
    total = 0
    top_base = "N"
    top_base_count = 0

    # uppercase reference base for comparison
    reference = reference.upper()

    base_counts = {"A": 0, "C": 0, "G": 0, "T": 0}

    quality_length = len(quality)

    for i in range(quality_length):

        # convert ASCII character to phred base quality
        base_quality = ord(quality[i]) - ASCII_OFFSET

        # only count high-quality bases
        if base_quality >= min_qual:

            currBase = pileup[i]

            if currBase in base_counts:
                base_counts[currBase] += 1

            else:
                # remaining symbols ("." and "," match markers, "*", etc.)
                # are counted toward the reference base
                base_counts[reference] += 1

            total += 1

    parsed["total"] = total

    # find top base
    for base in base_counts:
        if base_counts[base] > top_base_count:
            top_base = base
            top_base_count = base_counts[base]

    # if more than 0 bases were processed
    if total > 0:
        prop = top_base_count / float(total)

        if prop >= min_proportion:
            parsed["proportion"] = prop
            parsed["top_base"] = top_base

    return parsed

## process pileup file

# read in input file
if not DEBUG:
    out_file = open(out_file, "w")

with open(pileup_file, "rt") as pileup_file:
    for pileup_file_line in pileup_file:

        parsed = parse_pileup(pileup_file_line)

        proportion = parsed["proportion"]

        # write to stdout in debug mode, otherwise to the output file
        out_line = "\t".join([parsed["chromosome"], str(parsed["position"]), parsed["reference"],
            parsed["top_base"], str(round(proportion, ROUND)), parsed["pileup"], parsed["quality"]])

        print(out_line, file=None if DEBUG else out_file)

if not DEBUG:
    out_file.close()

Integrating genotypes and phenotypes improves long-term forecasts of seasonal influenza A/H3N2 evolution (revised-submission)

import argparse
from augur.utils import read_node_data, write_json
import Bio.Align
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from collections import Counter
import numpy as np
import pandas as pd


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Calculate consensus sequence for all non-zero frequency strains and the distance of each strain from the resulting consensus.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("--sequences", required=True, help="node data JSON containing sequences to find a consensus for")
    parser.add_argument("--frequencies", required=True, help="table of strain frequencies at the current timepoint")
    parser.add_argument("--sequence-attribute", default="aa_sequence", help="attribute in node data JSON containing the sequence data to use")
    parser.add_argument("--frequency-attribute", default="kde_frequency", help="attribute in frequency table representing the frequency data to use")
    parser.add_argument("--output", required=True, help="node data JSON with consensus sequence and distances from the consensus per strain")

    args = parser.parse_args()

    # Load sequence data from a node data JSON file.
    node_sequences = read_node_data(args.sequences)

    # Load frequency data.
    frequencies = pd.read_csv(args.frequencies, sep="\t")

    # Select names of strains with non-zero frequencies.
    strains = set(frequencies.query(f"{args.frequency_attribute} > 0.0")["strain"].values)

    # Select sequences for strains with non-zero frequencies.
    sequences = Bio.Align.MultipleSeqAlignment([
        SeqRecord(
            Seq(
                record[args.sequence_attribute],
            ),
            id=strain
        )
        for strain, record in node_sequences["nodes"].items()
        if strain in strains
    ])

    # Output will store the consensus sequence and the distance of each strain
    # to the consensus.
    output = {
        "nodes": {}
    }

    # Calculate the consensus sequence using a majority-rule approach where we
    # take the most common value in each column.
    consensus = "".join(
        Counter(sequences[:, i]).most_common(1)[0][0]
        for i in range(sequences.get_alignment_length())
    )
    output["consensus"] = consensus

    # Calculate the distance of each strain sequence from the consensus.
    consensus_array = np.frombuffer(consensus.encode(), dtype="S1")
    for sequence in sequences:
        sequence_array = np.frombuffer(str(sequence.seq).encode(), dtype="S1")
        distance = int((consensus_array != sequence_array).sum())
        output["nodes"][sequence.id] = {
            "distance_from_consensus": distance
        }

    # Output the results.
    write_json(output, args.output)
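
A minimal invocation of this script might look like the following; the script path and file names are placeholders, not the repository's actual paths:

python scripts/distance_to_consensus.py \
    --sequences aa_sequences.json \
    --frequencies frequencies.tsv \
    --output distance_to_consensus.json
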
tool / biotools

Consensus

The Consensus server aligns a sequence to a structural template using a consensus of five different alignment methods, and produces a reliability measure for each alignment position to predict which regions are suitable for comparative modelling.
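
As an illustration of the underlying idea only (not the server's actual algorithm or API), a majority-rule consensus over several alternative alignments can be scored per column by the fraction of methods that agree. A minimal Python sketch, assuming five equal-length gapped alignments of the same query:

# Minimal sketch: per-column reliability as the fraction of alignment
# methods agreeing with the majority call. The five gapped strings are
# illustrative stand-ins for the outputs of five alignment methods.
from collections import Counter

def column_reliability(alignments):
    """Per-column fraction of methods agreeing with the majority character."""
    n_methods = len(alignments)
    assert len({len(a) for a in alignments}) == 1, "alignments must be equal length"
    scores = []
    for column in zip(*alignments):
        majority_count = Counter(column).most_common(1)[0][1]
        scores.append(majority_count / n_methods)
    return scores

alignments = [
    "MKT-AYIA",
    "MKT-AYIA",
    "MK-TAYIA",
    "MKT-AYIA",
    "MKTA-YIA",
]

consensus = "".join(Counter(col).most_common(1)[0][0] for col in zip(*alignments))
reliability = column_reliability(alignments)
print(consensus)    # majority-rule consensus of the five alignments
print(reliability)  # low agreement flags ambiguously aligned columns

Positions where most methods agree would be reported as reliable and are therefore better candidates for comparative modelling.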