Workflow Steps and Code Snippets

124 tagged steps and code snippets that match the keyword "Consensus"

Bioinformatics pipeline for processing 16S marker gene metagenomic sequence data.

library(dada2)
#packageVersion("dada2")
library(tidyverse)

## Set variables
list_of_filenames <- snakemake@input$listfiles
tax_ref <- snakemake@config$dada2_tax_ref
spe_ref <- snakemake@config$dada2_spe_ref
bacterial <- snakemake@config$bacterial

## Make a vector of sample names
samples <- scan(list_of_filenames, what = "character")

out <- readRDS(snakemake@input$filt_out)


## file names of forward and reverse reads, after quality filtering
filtered_forward_reads <- file.path("output/temp/filtered", paste0(samples, "_r1_filtered.fastq.gz"))
filtered_reverse_reads <- file.path("output/temp/filtered", paste0(samples, "_r2_filtered.fastq.gz"))


####### Step 2: Generate error model of data
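## learnErrors alternates error-rate estimation and sample inference
## until the two converge on a jointly consistent solution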

err_forward <- learnErrors(filtered_forward_reads, multithread = TRUE)
err_reverse <- learnErrors(filtered_reverse_reads, multithread = TRUE)



####### Step 3: Dereplicate sequences

derep_forward <- derepFastq(filtered_forward_reads, verbose = TRUE)
derep_reverse <- derepFastq(filtered_reverse_reads, verbose = TRUE)

names(derep_forward) <- samples
names(derep_reverse) <- samples



####### Step 4: Infer sequence variants
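## dada() applies the learned error model to infer exact amplicon
## sequence variants (ASVs) from the dereplicated reads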

dadaF <- dada(derep_forward, err = err_forward, multithread = TRUE)
dadaR <- dada(derep_reverse, err = err_reverse, multithread = TRUE)



####### Step 5: Merge forward and reverse reads
merged <- mergePairs(dadaF, derep_forward, dadaR, derep_reverse, verbose = TRUE)



####### Step 6: Generate count table
seqtab <- makeSequenceTable(merged)




####### Step 7: Remove chimeras
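## with method = "consensus", samples are checked for chimeras
## independently and a consensus decision is made for each sequence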
seqtab.nochim <- removeBimeraDenovo(seqtab, method = "consensus", multithread = TRUE)


## Distribution of reads across the merged sequence lengths
readsbyseqlen <- tapply(colSums(seqtab.nochim), nchar(colnames(seqtab.nochim)), sum)
pdf(file.path("output", "merged_sequence_lengths.pdf"))
plot(as.integer(names(readsbyseqlen)), readsbyseqlen, xlab = "Merged length", ylab = "Total reads")
dev.off()


####### Step 7b: Track reads throughout processing

getN <- function(x) sum(getUniques(x))
summary_tab <- data.frame(
    row.names = samples,
    Input = out[, 1],
    Filtered = out[, 2],
    Denoised = sapply(dadaF, getN),
    Merged = sapply(merged, getN),
    Non.Chimeric = rowSums(seqtab.nochim),
    Total.Perc.Remaining = round(rowSums(seqtab.nochim) / out[, 1] * 100, 1)
)


pdf(file.path("output", "reads_remaining.pdf"))
hist(summary_tab$Total.Perc.Remaining, col = "blue", breaks = 50, main = "Total", xlab = "Percentage reads remaining")
dev.off()

## Write this table to output
write.table(summary_tab, file.path("output", "reads_tracked.txt"))

summary_tab$Sample <- rownames(summary_tab) 
summary_tab <- summary_tab %>% separate(Sample, c("Sample", "temp"), sep = "_S") 
summary_tab$Sample <- factor(summary_tab$Sample, levels = summary_tab$Sample[order(summary_tab$Non.Chimeric)])
summary_tab_long <- summary_tab %>% gather("QC.Step", "Reads", Input:Non.Chimeric)
summary_tab_long$QC.Step <- factor(summary_tab_long$QC.Step, levels = c("Input", "Filtered", "Denoised", "Merged", "Non.Chimeric"))


gg <- ggplot(summary_tab_long, aes(x = Sample, y = Reads, color = QC.Step)) +
	geom_point(size = 2) +
	scale_color_manual(values = rainbow(5, v = 0.8)) +
	theme_bw() +
	theme(axis.text.x = element_text(angle = 90))
pdf(file.path("output", "read_tracking.pdf"))
print(gg)
dev.off()



####### Step 8: Assign Taxonomy
################################################
## To do: add parameter to config file to allow user to decide whether or not to allow multiples
################################################
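## assignTaxonomy implements the RDP naive Bayes classifier, run
## against the reference training set supplied in the config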
taxa <- assignTaxonomy(seqtab.nochim, tax_ref, multithread = TRUE)

## Add Species (if 16S, not if ITS)
if (bacterial == TRUE) {
	taxa <- addSpecies(taxa, spe_ref, allowMultiple = TRUE)
}


####### Step 9: Save output

saveRDS(seqtab.nochim, file.path("output", "seqtab_final.rds"))
saveRDS(taxa, file.path("output", "taxa_final.rds"))

Analysis code for the TAP-seq manuscript.

## packages assumed by this snippet: Biostrings provides DNAStringSet(),
## consensusString() and stringDist(); dplyr and ggplot2 load via tidyverse;
## here() builds the output path
library(Biostrings)
library(tidyverse)
library(here)

# create a consensus gRNA sequence for every detected gRNA (perturbation) in each cell
gRNA_consensus_chr8 <- mm_perts8 %>% 
  group_by(cell_barcode, vector_id) %>% 
  summarize(grna_consensus_seq = consensusString(DNAStringSet(grna_seq)))

gRNA_consensus_chr11 <- mm_perts11 %>% 
  group_by(cell_barcode, vector_id) %>% 
  summarize(grna_consensus_seq = consensusString(DNAStringSet(grna_seq)))

# add expected gRNA sequence
gRNA_consensus_chr8 <- gRNA_consensus_chr8 %>% 
  left_join(select(gRNAs8, -vector_seq), by = "vector_id")

gRNA_consensus_chr11 <- gRNA_consensus_chr11 %>% 
  left_join(select(gRNAs11, -vector_seq), by = "vector_id")
# function to compute edit distance between consensus and expected gRNA sequences
calc_edit_dist <- function(x, y, method = "levenshtein") {
  as.integer(stringDist(c(x, y), method = method))
}

# calculate edit distance between sequenced consensus gRNA and expected gRNA sequence
gRNA_consensus_chr8 <- gRNA_consensus_chr8 %>% 
  rowwise() %>% 
  mutate(edit_dist = calc_edit_dist(grna_consensus_seq, grna_seq, method = "levenshtein"))

gRNA_consensus_chr11 <- gRNA_consensus_chr11 %>% 
  rowwise() %>% 
  mutate(edit_dist = calc_edit_dist(grna_consensus_seq, grna_seq, method = "levenshtein"))

# fraction of gRNA-cell combinations whose consensus exactly matches the expected sequence
noerr_chr8 <- mean(gRNA_consensus_chr8$edit_dist == 0)
noerr_chr11 <- mean(gRNA_consensus_chr11$edit_dist == 0)
# create one table
gRNA_cons <- bind_rows(Chr8 = gRNA_consensus_chr8, Chr11 = gRNA_consensus_chr11, .id = "Sample")

# create histogram of edit distance distribution
ggplot(gRNA_cons, aes(edit_dist, fill = Sample)) +
  geom_histogram(position = "dodge", bins = 20) +
  labs(x = "Edit distance\n(gRNA consensus sequence - template)",
       y = "gRNA - cell combinations") +
  scale_fill_manual(values = c(Chr11 = "indianred3", Chr8 = "steelblue")) +
  theme_bw() +
  theme(panel.grid = element_blank(), aspect.ratio = 1,
        text = element_text(size = 15))

# save plot to .pdf file
ggsave(filename = here("results/plots", "screen_gRNA_errors.pdf"), height = 4, width = 5)

Obtain unbiased SNPs after FASTQ files have been barcoded for UMIs.

__author__ = "Aurora Campo"
__copyright__ = "Copyright 2021, Aurora Campo, modified from git"
__git__ = "https://github.com/johanzi/make_pseudogenome"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell
#from snakemake_wrapper_utils.java import get_java_opts

input = snakemake.input.get("vcf")
extra = snakemake.params.get("extra")
ref = snakemake.input.get("ref")
pseudo = snakemake.output.get("pseudo")
log = snakemake.log_fmt_shell(stdout=False, stderr=True)


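# bcftools consensus applies the variants in the VCF for the selected
# sample to the reference FASTA and writes the sequence to stdout,
# which is redirected into the pseudogenome file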
shell(
    "bcftools consensus "
    "{input} "
    "--sample {extra} "
    "--fasta-ref {ref} "
    "> {pseudo} "
    "{log}"
)

Integrated Mapping and Profiling of Allelically-expressed Loci with Annotations (1.0.0)

shell:
	"""
	cat {input.ref_promoter} | bcftools consensus {input.phase_vcf} -H {wildcards.num}pIu > {output} 2> {log}
	"""

A tool for generating bacterial genomes from metagenomes with nanopore long read sequencing

shell: """
    medaka consensus {input[1]} {output} --model r941_flip213 --threads {threads} --regions {wildcards.range}
    """
shell: """
    bcftools consensus -f {input} -o {output}
    """

A straightforward bioinformatic pipeline for detecting a bacterial strain in one or more metagenome(s).

import sys
import gzip
import re

DEBUG = 0

ASCII_OFFSET = ord("!")  # phred+33 quality encoding
INDEL_PATTERN = re.compile(r"[+-](\d+)")
INDEL_STRING = "[+-]INTEGER[ACGTN]{INTEGER}"
ROUND = 3  # places to right of decimal point

min_coverage, min_proportion, min_qual = snakemake.params
pileup_file = snakemake.input[0]
out_file = snakemake.output[0]

min_coverage = int(min_coverage)
min_proportion = float(min_proportion)
min_qual = int(min_qual)

## function to process each line of pileup
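# each line of a samtools mpileup file has six tab-separated fields:
# chromosome, 1-based position, reference base, read depth,
# read bases (the pileup string) and base qualities (phred+33)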

def parse_pileup(line):

    # read fields from line of pileup
    chromosome, position, reference, coverage, pileup, quality = line.rstrip("\r\n").split("\t")

    # proportion is 0 if no consensus base, top base is N by default
    parsed = {
        "proportion": 0.0,
        "chromosome": chromosome,
        "position": int(position),
        "reference": reference,
        "coverage": int(coverage),
        "pileup": pileup,
        "quality": quality,
        "top_base": "N"}

    # if the base coverage is below the minimum, leave the call as N
    if parsed["coverage"] < min_coverage:
        return parsed

    # uppercase pileup string for processing
    pileup = pileup.upper()

    # Remove read-start markers ("^" plus its mapping-quality character)
    # and read-end markers ("$") from pileup string
    pileup = re.sub(r"\^.|\$", "", pileup)

    # Remove indels from pileup string; an indel is encoded as
    # "+<n><bases>" or "-<n><bases>", e.g. "+2AT" for a 2 bp insertion
    start = 0

    while True:
        match = INDEL_PATTERN.search(pileup, start)

        if match:
            integer = match.group(1)
            pileup = re.sub(INDEL_STRING.replace("INTEGER", integer), "", pileup)
            start = match.start()
        else:
            break

    # get total base count and top base count
    total = 0
    top_base = "N"
    top_base_count = 0

    # uppercase reference base for comparison
    reference = reference.upper()

    base_counts = {"A": 0, "C": 0, "G": 0, "T": 0}

    quality_length = len(quality)

    for i in range(quality_length):

        # convert ASCII character to phred base quality
        base_quality = ord(quality[i]) - ASCII_OFFSET

        # only count high-quality bases
        if base_quality >= min_qual:

            currBase = pileup[i]

            if currBase in base_counts:
                base_counts[currBase] += 1

            else:
                # remaining symbols ("." and "," match markers, "*", etc.)
                # are counted toward the reference base
                base_counts[reference] += 1

            total += 1

    parsed["total"] = total

    # find top base
    for base in base_counts:
        if base_counts[base] > top_base_count:
            top_base = base
            top_base_count = base_counts[base]

    # if more than 0 bases were processed
    if total > 0:
        prop = top_base_count / float(total)

        if prop >= min_proportion:
            parsed["proportion"] = prop
            parsed["top_base"] = top_base

    return parsed

## process pileup file

# read in input file
if not DEBUG:
    out_file = open(out_file, "w")

with open(pileup_file, "rt") as pileup_file:
    for pileup_file_line in pileup_file:

        parsed = parse_pileup(pileup_file_line)

        proportion = parsed["proportion"]

        # write to stdout in debug mode, otherwise to the output file
        out_line = "\t".join([parsed["chromosome"], str(parsed["position"]), parsed["reference"],
            parsed["top_base"], str(round(proportion, ROUND)), parsed["pileup"], parsed["quality"]])

        print(out_line, file=None if DEBUG else out_file)

if not DEBUG:
    out_file.close()

Integrating genotypes and phenotypes improves long-term forecasts of seasonal influenza A/H3N2 evolution (revised-submission)

import argparse
from augur.utils import read_node_data, write_json
import Bio.Align
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from collections import Counter
import numpy as np
import pandas as pd


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Calculate consensus sequence for all non-zero frequency strains and the distance of each strain from the resulting consensus.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("--sequences", required=True, help="node data JSON containing sequences to find a consensus for")
    parser.add_argument("--frequencies", required=True, help="table of strain frequencies at the current timepoint")
    parser.add_argument("--sequence-attribute", default="aa_sequence", help="attribute in node data JSON containing the sequence data to use")
    parser.add_argument("--frequency-attribute", default="kde_frequency", help="attribute in frequency table representing the frequency data to use")
    parser.add_argument("--output", required=True, help="node data JSON with consensus sequence and distances from the consensus per strain")

    args = parser.parse_args()

    # Load sequence data from a node data JSON file.
    node_sequences = read_node_data(args.sequences)

    # Load frequency data.
    frequencies = pd.read_csv(args.frequencies, sep="\t")

    # Select names of strains with non-zero frequencies.
    strains = set(frequencies.query(f"{args.frequency_attribute} > 0.0")["strain"].values)

    # Select sequences for strains with non-zero frequencies.
    sequences = Bio.Align.MultipleSeqAlignment([
        SeqRecord(
            Seq(
                record[args.sequence_attribute],
            ),
            id=strain
        )
        for strain, record in node_sequences["nodes"].items()
        if strain in strains
    ])

    # Output will store the consensus sequence and the distance of each strain
    # to the consensus.
    output = {
        "nodes": {}
    }

    # Calculate the consensus sequence using a majority-rule approach where we
    # take the most common value in each column.
    consensus = "".join(
        Counter(sequences[:, i]).most_common(1)[0][0]
        for i in range(sequences.get_alignment_length())
    )
    output["consensus"] = consensus

    # Calculate the distance of each strain sequence from the consensus.
    consensus_array = np.frombuffer(consensus.encode(), dtype="S1")
    for sequence in sequences:
        sequence_array = np.frombuffer(str(sequence.seq).encode(), dtype="S1")
        distance = int((consensus_array != sequence_array).sum())
        output["nodes"][sequence.id] = {
            "distance_from_consensus": distance
        }

    # Output the results.
    write_json(output, args.output)
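
A minimal invocation of this script might look like the following; the script path and file names are placeholders, not the repository's actual paths:

python scripts/distance_to_consensus.py \
    --sequences aa_sequences.json \
    --frequencies frequencies.tsv \
    --output distance_to_consensus.json
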
tool / biotools

Consensus

The Consensus server aligns a sequence to a structural template using a consensus of five different alignment methods, and produces a reliability measure for each alignment position to predict which regions are suitable for comparative modelling.
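
As an illustration of the underlying idea only (not the server's actual algorithm or API), a majority-rule consensus over several alternative alignments can be scored per column by the fraction of methods that agree. A minimal Python sketch, assuming five equal-length gapped alignments of the same query:

# Minimal sketch: per-column reliability as the fraction of alignment
# methods agreeing with the majority call. The five gapped strings are
# illustrative stand-ins for the outputs of five alignment methods.
from collections import Counter

def column_reliability(alignments):
    """Per-column fraction of methods agreeing with the majority character."""
    n_methods = len(alignments)
    assert len({len(a) for a in alignments}) == 1, "alignments must be equal length"
    scores = []
    for column in zip(*alignments):
        majority_count = Counter(column).most_common(1)[0][1]
        scores.append(majority_count / n_methods)
    return scores

alignments = [
    "MKT-AYIA",
    "MKT-AYIA",
    "MK-TAYIA",
    "MKT-AYIA",
    "MKTA-YIA",
]

consensus = "".join(Counter(col).most_common(1)[0][0] for col in zip(*alignments))
reliability = column_reliability(alignments)
print(consensus)    # majority-rule consensus of the five alignments
print(reliability)  # low agreement flags ambiguously aligned columns

Positions where most methods agree would be reported as reliable and are therefore better candidates for comparative modelling.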