Workflow Steps and Code Snippets
124 tagged steps and code snippets that match the keyword "Consensus"
Bioinformatics pipeline for processing 16S marker-gene metagenomic sequence data.
```r
library(dada2) #packageVersion("dada2")
library(tidyverse)

## Set variables
list_of_filenames <- snakemake@input$listfiles
tax_ref <- snakemake@config$dada2_tax_ref
spe_ref <- snakemake@config$dada2_spe_ref
bacterial <- snakemake@config$bacterial

## Make a vector of sample names
samples <- scan(list_of_filenames, what = "character")
out <- readRDS(snakemake@input$filt_out)

## file names of forward and reverse reads, after quality filtering
filtered_forward_reads <- file.path("output/temp/filtered", paste0(samples, "_r1_filtered.fastq.gz"))
filtered_reverse_reads <- file.path("output/temp/filtered", paste0(samples, "_r2_filtered.fastq.gz"))

####### Step 2: Generate error model of data
err_forward <- learnErrors(filtered_forward_reads, multithread = TRUE)
err_reverse <- learnErrors(filtered_reverse_reads, multithread = TRUE)

####### Step 3: Dereplicate sequences
derep_forward <- derepFastq(filtered_forward_reads, verbose = TRUE)
derep_reverse <- derepFastq(filtered_reverse_reads, verbose = TRUE)
names(derep_forward) <- samples
names(derep_reverse) <- samples

####### Step 4: Infer sequence variants
dadaF <- dada(derep_forward, err = err_forward, multithread = TRUE)
dadaR <- dada(derep_reverse, err = err_reverse, multithread = TRUE)

####### Step 5: Merge forward and reverse reads
merged <- mergePairs(dadaF, derep_forward, dadaR, derep_reverse, verbose = TRUE)

####### Step 6: Generate count table
seqtab <- makeSequenceTable(merged)

####### Step 7: Remove chimeras
seqtab.nochim <- removeBimeraDenovo(seqtab, method = "consensus", multithread = TRUE)

## How are the reads concentrated in the merged sequence lengths
readsbyseqlen <- tapply(colSums(seqtab.nochim), nchar(colnames(seqtab.nochim)), sum)
pdf(file.path("output", "merged_sequence_lengths.pdf"))
plot(as.integer(names(readsbyseqlen)), readsbyseqlen, xlab = "Merged length", ylab = "Total reads")
dev.off()

####### Step 7b: Track reads throughout processing
getN <- function(x) sum(getUniques(x))
summary_tab <- data.frame(row.names = samples,
                          Input = out[, 1],
                          Filtered = out[, 2],
                          Denoised = sapply(dadaF, getN),
                          Merged = sapply(merged, getN),
                          Non.Chimeric = rowSums(seqtab.nochim),
                          Total.Perc.Remaining = round(rowSums(seqtab.nochim) / out[, 1] * 100, 1))
pdf(file.path("output", "reads_remaining.pdf"))
hist(summary_tab$Total.Perc.Remaining, col = "blue", breaks = 50, main = "Total",
     xlab = "Percentage reads remaining")
dev.off()

## Write this table to output
write.table(summary_tab, file.path("output", "reads_tracked.txt"))

summary_tab$Sample <- rownames(summary_tab)
summary_tab <- summary_tab %>% separate(Sample, c("Sample", "temp"), sep = "_S")
summary_tab$Sample <- factor(summary_tab$Sample, levels = summary_tab$Sample[order(summary_tab$Non.Chimeric)])
summary_tab_long <- summary_tab %>% gather("QC.Step", "Reads", Input:Non.Chimeric)
summary_tab_long$QC.Step <- factor(summary_tab_long$QC.Step,
                                   levels = c("Input", "Filtered", "Denoised", "Merged", "Non.Chimeric"))
gg <- ggplot(summary_tab_long, aes(x = Sample, y = Reads, color = QC.Step)) +
  geom_point(size = 2) +
  scale_color_manual(values = rainbow(5, v = 0.8)) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90))
pdf(file.path("output", "read_tracking.pdf"))
print(gg)  # explicitly print the ggplot so it is written to the open pdf device
dev.off()

####### Step 8: Assign Taxonomy
################################################
## To do: add parameter to config file to allow user to decide whether or not to allow multiples
################################################
taxa <- assignTaxonomy(seqtab.nochim, tax_ref, multithread = TRUE)

## Add Species (if 16S, not if ITS)
if (bacterial == TRUE) {
  taxa <- addSpecies(taxa, spe_ref, allowMultiple = TRUE)
}

####### Step 9: Save output
saveRDS(seqtab.nochim, file.path("output", "seqtab_final.rds"))
saveRDS(taxa, file.path("output", "taxa_final.rds"))
```
Analysis code for the TAP-seq manuscript.
```r
# create a consensus gRNA sequence for every detected gRNA (perturbation) in each cell
gRNA_consensus_chr8 <- mm_perts8 %>%
  group_by(cell_barcode, vector_id) %>%
  summarize(grna_consensus_seq = consensusString(DNAStringSet(grna_seq)))

gRNA_consensus_chr11 <- mm_perts11 %>%
  group_by(cell_barcode, vector_id) %>%
  summarize(grna_consensus_seq = consensusString(DNAStringSet(grna_seq)))

# add expected gRNA sequence
gRNA_consensus_chr8 <- gRNA_consensus_chr8 %>%
  left_join(select(gRNAs8, -vector_seq), by = "vector_id")

gRNA_consensus_chr11 <- gRNA_consensus_chr11 %>%
  left_join(select(gRNAs11, -vector_seq), by = "vector_id")
```
```r
# function to compute edit distance between consensus and expected gRNA sequences
calc_edit_dist <- function(x, y, method = "levenshtein") {
  as.integer(stringDist(c(x, y), method = method))
}

# calculate edit distance between sequenced consensus gRNA and expected gRNA sequence
gRNA_consensus_chr8 <- gRNA_consensus_chr8 %>%
  rowwise() %>%
  mutate(edit_dist = calc_edit_dist(grna_consensus_seq, grna_seq, method = "levenshtein"))

gRNA_consensus_chr11 <- gRNA_consensus_chr11 %>%
  rowwise() %>%
  mutate(edit_dist = calc_edit_dist(grna_consensus_seq, grna_seq, method = "levenshtein"))

# proportion of gRNA-cell combinations whose consensus matches the template exactly
noerr_chr8 <- mean(gRNA_consensus_chr8$edit_dist == 0)
noerr_chr11 <- mean(gRNA_consensus_chr11$edit_dist == 0)
```
```r
# create one table
gRNA_cons <- bind_rows(Chr8 = gRNA_consensus_chr8, Chr11 = gRNA_consensus_chr11, .id = "Sample")

# create histogram of edit distance distribution
ggplot(gRNA_cons, aes(edit_dist, fill = Sample)) +
  geom_histogram(position = "dodge", bins = 20) +
  labs(x = "Edit distance\n(gRNA consensus sequence - template)",
       y = "gRNA - cell combinations") +
  scale_fill_manual(values = c(Chr11 = "indianred3", Chr8 = "steelblue")) +
  theme_bw() +
  theme(panel.grid = element_blank(), aspect.ratio = 1, text = element_text(size = 15))

# save plot to .pdf file
ggsave(filename = here("results/plots", "screen_gRNA_errors.pdf"), height = 4, width = 5)
```
Obtain unbiased SNPs after FASTQ files have been barcoded with UMIs
```python
__author__ = "Aurora Campo"
__copyright__ = "Copyright 2021, Aurora Campo, modified from git"
__git__ = "https://github.com/johanzi/make_pseudogenome"
__email__ = "[email protected]"
__license__ = "MIT"

from snakemake.shell import shell
#from snakemake_wrapper_utils.java import get_java_opts

input = snakemake.input.get("vcf")
extra = snakemake.params.get("extra")
ref = snakemake.input.get("ref")
pseudo = snakemake.output.get("pseudo")

log = snakemake.log_fmt_shell(stdout=False, stderr=True)

shell(
    "bcftools consensus "
    "{input} "
    "--sample {extra} "
    "--fasta-ref {ref} "
    "> {pseudo} "
    "{log}"
)
```
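A minimal sketch of a rule that might call this wrapper script. The `vcf`/`ref` input keys, the `pseudo` output key, and the `extra` parameter (the sample name passed to `bcftools consensus --sample`) follow what the script expects; the rule name, file paths, and script location are assumptions.

```python
# Hypothetical rule invoking the bcftools consensus wrapper above
# (rule name, file paths, and script location are assumptions).
rule make_pseudogenome:
    input:
        vcf="calls/{sample}.vcf.gz",   # bgzipped and indexed VCF
        ref="reference/genome.fa",
    output:
        pseudo="pseudogenomes/{sample}.fa",
    params:
        extra="{sample}",  # forwarded to `bcftools consensus --sample`
    log:
        "logs/consensus/{sample}.log",
    script:
        "scripts/bcftools_consensus.py"
```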
Integrated Mapping and Profiling of Allelically-expressed Loci with Annotations (1.0.0)
```python
shell:
    """
    cat {input.ref_promoter} | bcftools consensus {input.phase_vcf} -H {wildcards.num}pIu > {output} 2> {log}
    """
```
A tool for generating bacterial genomes from metagenomes with nanopore long-read sequencing
```python
shell:
    """
    medaka consensus {input[1]} {output} --model r941_flip213 --threads {threads} --regions {wildcards.range}
    """
```
```python
shell:
    """
    bcftools consensus -f {input} -o {output}
    """
```
A straightforward bioinformatic pipeline for detecting a bacterial strain in one or more metagenomes.
```python
import sys
import gzip
import re

DEBUG = 0
ASCII_OFFSET = ord("!")
INDEL_PATTERN = re.compile(r"[+-](\d+)")
INDEL_STRING = "[+-]INTEGER[ACGTN]{INTEGER}"
ROUND = 3  # places to right of decimal point

min_coverage, min_proportion, min_qual = snakemake.params
pileup_file = snakemake.input[0]
out_file = snakemake.output[0]

min_coverage = int(min_coverage)
min_proportion = float(min_proportion)
min_qual = int(min_qual)


## function to process each line of pileup
def parse_pileup(line):
    # read fields from line of pileup
    chromosome, position, reference, coverage, pileup, quality = line.rstrip("\r\n").split("\t")
    # proportion is 0 if no consensus base, top base is N by default
    parsed = {
        "proportion": 0.0,
        "chromosome": chromosome,
        "position": int(position),
        "reference": reference,
        "coverage": int(coverage),
        "pileup": pileup,
        "quality": quality,
        "top_base": "N"}
    # if the base coverage is below the limit, call N
    if parsed["coverage"] < min_coverage:
        return parsed
    # uppercase pileup string for processing
    pileup = pileup.upper()
    # Remove start and stop characters from pileup string
    pileup = re.sub(r"\^.|\$", "", pileup)
    # Remove indels from pileup string
    start = 0
    while True:
        match = INDEL_PATTERN.search(pileup, start)
        if match:
            integer = match.group(1)
            pileup = re.sub(INDEL_STRING.replace("INTEGER", integer), "", pileup)
            start = match.start()
        else:
            break
    # get total base count and top base count
    total = 0
    top_base = "N"
    top_base_count = 0
    # uppercase reference base for comparison
    reference = reference.upper()
    base_counts = {"A": 0, "C": 0, "G": 0, "T": 0}
    quality_length = len(quality)
    for i in range(quality_length):
        # convert ASCII character to phred base quality
        base_quality = ord(quality[i]) - ASCII_OFFSET
        # only count high-quality bases
        if base_quality >= min_qual:
            currBase = pileup[i]
            if currBase in base_counts:
                base_counts[currBase] += 1
            else:
                base_counts[reference] += 1
            total += 1
    parsed["total"] = total
    # find top base
    for base in base_counts:
        if base_counts[base] > top_base_count:
            top_base = base
            top_base_count = base_counts[base]
    # if more than 0 bases processed
    if total > 0:
        prop = top_base_count / float(total)
        if prop >= min_proportion:
            parsed["proportion"] = prop
            parsed["top_base"] = top_base
    return parsed


## process pileup file
# read in input file
if not DEBUG:
    out_file = open(out_file, "w")
with open(pileup_file, "rt") as pileup_file:
    for pileup_file_line in pileup_file:
        parsed = parse_pileup(pileup_file_line)
        proportion = parsed["proportion"]
        # print to output file
        if DEBUG:
            print("\t".join([parsed["chromosome"], str(parsed["position"]), parsed["reference"],
                             parsed["top_base"], str(round(proportion, ROUND)),
                             parsed["pileup"], parsed["quality"]]))
        else:
            print("\t".join([parsed["chromosome"], str(parsed["position"]), parsed["reference"],
                             parsed["top_base"], str(round(proportion, ROUND)),
                             parsed["pileup"], parsed["quality"]]), file=out_file)
if not DEBUG:
    out_file.close()
```
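Because the script unpacks `snakemake.params` positionally (`min_coverage, min_proportion, min_qual`), the calling rule has to pass those three values in that order. A minimal sketch, with hypothetical rule name, threshold values, and paths:

```python
# Hypothetical rule passing the three thresholds the consensus-calling script expects, in order
# (values and paths are assumptions).
rule call_consensus_bases:
    input:
        "pileup/{sample}.pileup",
    output:
        "consensus/{sample}.tsv",
    params:
        min_coverage=10,     # minimum read depth required to call a base
        min_proportion=0.9,  # minimum fraction of high-quality bases agreeing on the top base
        min_qual=20,         # minimum phred base quality for a base to be counted
    script:
        "scripts/parse_pileup.py"
```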
Integrating genotypes and phenotypes improves long-term forecasts of seasonal influenza A/H3N2 evolution (revised-submission)
```python
import argparse
from augur.utils import read_node_data, write_json
import Bio.Align
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from collections import Counter
import numpy as np
import pandas as pd


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Calculate consensus sequence for all non-zero frequency strains and the distance of each strain from the resulting consensus.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("--sequences", required=True, help="node data JSON containing sequences to find a consensus for")
    parser.add_argument("--frequencies", required=True, help="table of strain frequencies at the current timepoint")
    parser.add_argument("--sequence-attribute", default="aa_sequence", help="attribute in node data JSON containing the sequence data to use")
    parser.add_argument("--frequency-attribute", default="kde_frequency", help="attribute in frequency table representing the frequency data to use")
    parser.add_argument("--output", required=True, help="node data JSON with consensus sequence and distances from the consensus per strain")
    args = parser.parse_args()

    # Load sequence data from a node data JSON file.
    node_sequences = read_node_data(args.sequences)

    # Load frequency data.
    frequencies = pd.read_csv(args.frequencies, sep="\t")

    # Select names of strains with non-zero frequencies.
    strains = set(frequencies.query(f"{args.frequency_attribute} > 0.0")["strain"].values)

    # Select sequences for strains with non-zero frequencies.
    sequences = Bio.Align.MultipleSeqAlignment([
        SeqRecord(
            Seq(
                record[args.sequence_attribute],
            ),
            id=strain
        )
        for strain, record in node_sequences["nodes"].items()
        if strain in strains
    ])

    # Output will store the consensus sequence and the distance of each strain
    # to the consensus.
    output = {
        "nodes": {}
    }

    # Calculate the consensus sequence using a majority-rule approach where we
    # take the most common value in each column.
    consensus = "".join(
        Counter(sequences[:, i]).most_common(1)[0][0]
        for i in range(sequences.get_alignment_length())
    )
    output["consensus"] = consensus

    # Calculate the distance of each strain sequence from the consensus.
    consensus_array = np.frombuffer(consensus.encode(), dtype="S1")
    for sequence in sequences:
        sequence_array = np.frombuffer(str(sequence.seq).encode(), dtype="S1")
        distance = int((consensus_array != sequence_array).sum())
        output["nodes"][sequence.id] = {
            "distance_from_consensus": distance
        }

    # Output the results.
    write_json(output, args.output)
```
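For reference, the script might be invoked from a workflow rule roughly as follows. The flags mirror the argparse options defined above, while the rule name, script filename, and file paths are assumptions.

```python
# Hypothetical invocation of the consensus-distance script above
# (script filename and paths are assumptions; flags match the argparse options).
rule distance_from_consensus:
    input:
        sequences="results/aa_sequences_{timepoint}.json",
        frequencies="results/frequencies_{timepoint}.tsv",
    output:
        "results/distance_from_consensus_{timepoint}.json",
    shell:
        """
        python3 scripts/calculate_consensus_distance.py \
            --sequences {input.sequences} \
            --frequencies {input.frequencies} \
            --sequence-attribute aa_sequence \
            --frequency-attribute kde_frequency \
            --output {output}
        """
```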
tool / biotools
Consensus
The Consensus server aligns a sequence to a structural template using a consensus of 5 different alignment methods. A measure of reliability is produced for each alignment position in order to predict the suitability of regions for comparative modelling.