GEMSCAN - GErMline Snp Small Indel CAller PipeliNe
Help improve this workflow!
This workflow has been published but could be further improved with some additional meta data:- Keyword(s) in categories input, output, operation, topic
You can help improve this workflow by suggesting the addition or removal of keywords, suggest changes and report issues, or request to become a maintainer of the Workflow .
Joint variant calling with GATK4 HaplotypeCaller, Google DeepVariant 1.0.0 and Strelka2, coordinated via Snakemake.
Authors
-
Shalabh Suman, Bari Ballew and Wendy Wong
-
Technical Lead: Bin Zhu
Usage
If you use this workflow in a paper, don't forget to give credits to the authors by citing the URL of this (original) repository and, if available, its DOI (see above). The documentation of this pipeline is at nci-cgr.github.io/gemscan/
Contribute back
In case you have also changed or added steps, please consider contributing them back to the original repository:
-
Fork the original repo to a personal or lab account.
-
Clone the fork to your local system.
-
Commit and push your changes to your fork.
-
Create a pull request against the original repository.
Code Snippets
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 | import argparse import csv import datetime import re import sys # global variables: # VCF structure (used instead of index numbers for readability) chrom = 0 pos = 1 snpID = 2 ref = 3 alt = 4 qual = 5 filt = 6 info = 7 frmt = 8 ############################################# Functions ############################################# def get_args(): """Handle command line arguments (input and output file names).""" parser = argparse.ArgumentParser( description="Takes VCF file with samples that have been called by the three callers and returns a VCF file where the genotypes from each caller are combined." ) parser.add_argument("infile", help="Input VCF file name") parser.add_argument("outfile", help="Output VCF file name") results = parser.parse_args() return results.infile, results.outfile def check_file(fname): """Check that file can be read; exit with error message if not.""" try: f = open(fname, "rb") f.close() return 0 except IOError: print("ERROR: Could not read file", fname) return 1 sys.exit() f.close() def get_header(infile): """Extract header from VCF. Exit with error message if no header detected. """ headerCheck = 1 with open(infile, "r") as file: for line in csv.reader(file, delimiter="\t"): if "#CHROM" in line: headerCheck = vcf_check(line) return line if headerCheck == 1: print("ERROR: File must contain header row matching VCF specification") return 1 sys.exit() def vcf_check(line): """Rudimentary format check. Must have #CHROM-designated header row and >=2 genotype columns. Must have an 3X number of genotype columns (does not actually check pairing). """ if (len(line) - 9) % 3 != 0 or len(line) < 12: print("ERROR: Unpaired sample names detected. File must contain triple number of samples.") return 1 sys.exit() # note that there should be (9 + 3 x no. of samples) number of columns else: return 0 def evaluate_variant_line(line, start1, end1, start2, end2, start3, end3): """For non-header lines, determines what needs to be done do merge the genotype information: 1. Add set annotation (HC, DV, strelka2, HC-DV, HC-strelka2, DV-strelka2, HC-DV-strelka2) to INFO 2. Removes empty genotypes for variants called by one caller 3. Removes chr_pos_ref_alt from ID field for DV-only variants 4. Integrates info for variants called by both """ if (":DV_" in line[frmt]) and not (":HC_" in line[frmt]) and not (":strelka2_" in line[frmt]): line = add_set_tag("DV", line) line[filt] = "oneCaller" line = remove_empty_genotypes(line, start1, end1, start2, end2, start3, end3) line[snpID] = "." return line elif (":HC_" in line[frmt]) and not (":DV_" in line[frmt]) and not (":strelka2_" in line[frmt]): line = add_set_tag("HC", line) line[filt] = "oneCaller" line = remove_empty_genotypes(line, start1, end1, start2, end2, start3, end3) return line elif (":DV_" in line[frmt]) and (":HC_" in line[frmt]) and not (":strelka2_" in line[frmt]): line = add_set_tag("HC-DV", line) line[filt] = "twoCallers" line = combine_genotypes(line, start1, end1, start2, end2, start3, end3) line[snpID] = "." return line elif (":strelka2_" in line[frmt]) and not (":HC_" in line[frmt]) and not (":DV_" in line[frmt]): line = add_set_tag("strelka2", line) line[filt] = "oneCaller" line = remove_empty_genotypes(line, start1, end1, start2, end2, start3, end3) line[snpID] = "." return line elif (":strelka2_" in line[frmt]) and (":HC_" in line[frmt]) and not (":DV_" in line[frmt]): line = add_set_tag("HC-strelka2", line) line[filt] = "twoCallers" line = combine_genotypes(line, start1, end1, start2, end2, start3, end3) line[snpID] = "." return line elif (":strelka2_" in line[frmt]) and (":DV_" in line[frmt]) and not (":HC_" in line[frmt]): line = add_set_tag("DV-strelka2", line) line[filt] = "twoCallers" line = combine_genotypes(line, start1, end1, start2, end2, start3, end3) line[snpID] = "." return line elif (":strelka2_" in line[frmt]) and (":HC_" in line[frmt]) and (":DV_" in line[frmt]): line = add_set_tag("HC-DV-strelka2", line) line[filt] = "threeCallers" line = combine_genotypes(line, start1, end1, start2, end2, start3, end3) line[snpID] = "." return line else: print("ERROR: No caller annotation found in FORMAT field.") return 1 sys.exit() def find_genotype_indices(line): """Determines the start/stop point for genotype columns from the three callers. bcftools merge --force-samples results in a VCF with samples as so: chr pos ... sample1 sample2 2:sample1 2:sample2 3:sample1 3:sample2, where the first two columns are called with caller1 and the second two with caller2, and so on. This function determines the index numbers defining the ranges (assuming the columns are not inter-mingled, e.g. sample1 2:sample1 2:sample2 sample2). Bear in mind that python slicing [start:stop] gets items start to stop-1. Example: vcf with 6 genotype fields at indices 9-14: 0,1,...8,9,10,11,12,13,14 see inline comments working through this example. """ start1 = 9 # start1=9; index=9 field end3 = len(line) # end2=15; index=15-1=14 field num_samples = int((end3 - 9) / 3) end1 = 9 + num_samples start2 = end1 end2 = 9 + num_samples * 2 start3 = end2 return start1, end1, start2, end2, start3, end3 def add_set_tag(caller, line): """Add set (HC, DV, HC-DV, strelka2, HC-strelka2, DV-strelka2 and HC-DV-strelka2) to INFO fields.""" line[info] = line[info] + ";set=" + caller return line def remove_empty_genotypes(line, start1, end1, start2, end2, start3, end3): """For variants found by only one caller, remove empty (.:.:.) fields. If only DeepVariant has call, set dv_priority_GT to the DV GT Set all consensus GT to ./. """ line[8] = line[8] + ":concensus_GT:dv_priority_GT" if any("0" in s for s in line[start1:end1]) or any("1" in s for s in line[start1:end1]): for i in range(start1, end1): line[i] = line[i] + ":" + "./." + ":" + "./." line = line[0:end1] return line elif any("0" in s for s in line[start2:end2]) or any("1" in s for s in line[start2:end2]): for i in range(start2, end2): s = line[i] geno = s.split(":") line[i] = line[i] + ":" + "./." + ":" + geno[0] line = line[0:9] + line[start2:end2] return line elif any("0" in s for s in line[start3:end3]) or any("1" in s for s in line[start3:end3]): for i in range(start3, end3): line[i] = line[i] + ":" + "./." + ":" + "./." line = line[0:9] + line[start3:end3] return line else: print("remove_empty_genotypes ERROR: All genotype fields are blank.") print(line) return 1 sys.exit() def flip_hets(gt): if gt == "1/0": gt = "0/1" return gt def get_concensus_gt(gt1, gt2, gt3): # In this order: HC, DV, strelka2 """ This function finds the consensus GT If all 3 callers have calls: and any of the two callers are concordant: GT is set to that concordant GT; or none are concordant: GT is set to ./. If only 2 callers have calls: and they are concordant: GT is set to that concordant GT; and they are not concordant: GT is set to ./. Finally, If only one caller has call, it's set to ./. """ if flip_hets(gt1) == flip_hets(gt2): concensus_gt = gt1 elif flip_hets(gt2) == flip_hets(gt3): concensus_gt = gt2 elif flip_hets(gt1) == flip_hets(gt3): concensus_gt = gt1 else: concensus_gt = "./." return concensus_gt def get_dv_priority_gt(gt1, gt2, gt3): # In this order: HC, DV, strelka2 """ This function finds the GT that's from DV call when possible. If there's no DV call: and HC and strelka2 calls the same genotype, GT is set to that genotype otherwise GT is set to ./. """ dv_priority_gt = "./." if gt2.count(".") == 0: dv_priority_gt = gt2 elif flip_hets(gt1) == flip_hets(gt3): dv_priority_gt = gt1 return dv_priority_gt def combine_genotypes(line, start1, end1, start2, end2, start3, end3): """For variants found by only two callers, integrate genotype info. Variants called by both callers will look like this: sample1 2:sample1 0/1:3:4,5:.:.:. .:.:.:0/1:2:4,3 We want the union of this information, e.g.: sample1 0/1:3:4,5:0/1:2:4,3 This function compares the three genotype fields sample1, 2:sample1 and 3:sample1, and anywhere there's a '.' in sample1, it updates it with non-'.' data from 2:sample1 or 3:sample1 if available. This assumes that the VCF is well-formed, meaning each genotype column conforms equally to the FORMAT definition. This function also updates the GT column. If all 3 callers have calls: and any of the two callers are concordant: GT is set to that concordant GT; or none are concordant: GT is set to ./. If only 2 callers have calls: and they are concordant: GT is set to that concordant GT; and they are not concordant: GT is set to ./. Finally, If only one caller has call, it's set to that GT """ for x, y, z in zip(line[start1:end1], line[start2:end2], line[start3:end3]): geno1 = x.split(":") geno2 = y.split(":") geno3 = z.split(":") field = line.index(x) for i, g1 in enumerate(geno1): if i == 0: if (geno1[i] != geno2[i]) and (geno1[i] != geno3[i]) and (geno2[i] != geno3[i]): geno1[i] = "./." elif geno2[i] == geno3[i]: geno1[i] = geno2[i] if (geno1[i] == ".") and (geno2[i] != "."): geno1[i] = geno2[i] if (geno1[i] == ".") and (geno2[i] == ".") and (geno3[i] != "."): geno1[i] = geno3[i] line[field] = ":".join(geno1) # add GT concensus_gt = get_concensus_gt(geno1[0], geno2[0], geno3[0]) # print ("for dv:" + geno1[0] + " " + geno2[0] + " " + geno3[0]) dv_priority_gt = get_dv_priority_gt(geno1[0], geno2[0], geno3[0]) # print ("dv_priority_gt:" + dv_priority_gt) line[field] = line[field] + ":" + concensus_gt + ":" + dv_priority_gt # add field to format line[start1 - 1] = line[start1 - 1] + ":concensus_GT:dv_priority_GT" return line[0:end1] def add_headers(ts, ver, scriptName, cmdString): """Add metadata to the vcf To A) account for new INFO field and to B) document provenance. ###TODO: add reference info as well? """ infoHeader = '##INFO=<ID=set,Number=.,Type=String,Description="Set of callers that identified a variant (HC, DV, strelka2, HC-DV, HC-strelka2, DV-strelka2, or HC-DV-strelka2 )">' filterHeaderOneCaller = ( '##FILTER=<ID=oneCaller,Description="The variant was called by exactly one caller">' ) filterHeaderTwoCallers = ( '##FILTER=<ID=twoCallers,Description="The variant was called by exactly two callers">' ) filterHeaderThreeCallers = ( '##FILTER=<ID=threeCallers,Description="The variant was called by all three callers">' ) formatHeaderConcensusGT = ( '##FORMAT=<ID=concensus_GT,Number=1,Type=String,Description="Genotype">' ) formatHeaderDVPriorityGT = ( '##FORMAT=<ID=dv_priority_GT,Number=1,Type=String,Description="Genotype">' ) prov1 = ( "##" + scriptName + "_Version=" + ver + ", Union of HC, DV and strelka2 genotype data, " + ts ) prov2 = "##" + scriptName + "_Command=" + cmdString return [ filterHeaderOneCaller, filterHeaderTwoCallers, filterHeaderThreeCallers, infoHeader, formatHeaderConcensusGT, formatHeaderDVPriorityGT, prov1, prov2, ] ##################################################################################################### if __name__ == "__main__": ts = str(datetime.datetime.now()) ver = "someversion" # https://stackoverflow.com/questions/5581722/how-can-i-rewrite-python-version-with-git scriptName = sys.argv[0] cmdString = " ".join(sys.argv) infile, outfile = get_args() check_file(infile) headerLine = get_header(infile) start1, end1, start2, end2, start3, end3 = find_genotype_indices(headerLine) with open(infile, "r") as file, open(outfile, "w") as out: for line in csv.reader(file, delimiter="\t"): if re.search(r"#", line[chrom]) is None: line = evaluate_variant_line(line, start1, end1, start2, end2, start3, end3) out.write("\t".join(line) + "\n") elif "#CHROM" in line: out.write("\n".join(add_headers(ts, ver, scriptName, cmdString)) + "\n") out.write("\t".join(line[0:start2]) + "\n") else: out.write("\t".join(line) + "\n") |
18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 | set -euo pipefail inFile=$1 headerFile=$2 caller=$3 inFileName="${inFile##*/}" inFilePath="${inFile%/*}" headerName="${headerFile##*/}" headerPath="${headerFile%/*}" outFile="${inFilePath}/prepended.${inFileName}" outHeader="${headerPath}/prepended.${headerName}" cut -f8 "${inFile}" | awk -v var="${caller}" 'BEGIN{FS=OFS=";"} {for(i=1;i<=NF;i++) $i=var"_"$i; print $0}' > "info.${inFileName}.temp" cut -f9 "${inFile}" | awk -v var="${caller}" 'BEGIN{FS=OFS=":"} {for(i=1;i<=NF;i++) $i=var"_"$i; print "GT:"$0}' > "format.${inFileName}.temp" cut -f1-7 "${inFile}" > "left.${inFileName}.temp" cut -f10- "${inFile}" | awk 'BEGIN{FS=OFS="\t"} {for(i=1;i<=NF;i++) {split($i,a,":"); $i=a[1]":"$i} print $0}' > "right.${inFileName}.temp" # need to duplicate GT and preserve one as plain "GT" to allow bcftools merge to work as expected paste "left.${inFileName}.temp" "info.${inFileName}.temp" "format.${inFileName}.temp" "right.${inFileName}.temp" | awk -v var="${caller}" 'BEGIN{FS=OFS="\t"} {$8 = $8";"var"_QUAL="$6; print $0}' > "${outFile}" rm "left.${inFileName}.temp" "info.${inFileName}.temp" "format.${inFileName}.temp" "right.${inFileName}.temp" sed "s/##INFO=<ID=/##INFO=<ID=${caller}_/" "${headerFile}" | sed "s/##FORMAT=<ID=/##FORMAT=<ID=${caller}_/" | sed "/##FORMAT=<ID=${caller}_GT/ i\##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Concordant Genotype\">\n##INFO=<ID=${caller}_QUAL,Number=1,Type=String,Description=\"Record of original QUAL value\">" > "${outHeader}" |
Support
- Future updates
Related Workflows





