Benchmarking of CNV calling tools

public 1yr ago 0 bookmarks

View Workflow

Help improve this workflow!

This workflow has been published but could be further improved with some additional meta data:

Keyword(s) in categories input, output, operation, topic

You can help improve this workflow by suggesting the addition or removal of keywords, suggest changes and report issues, or request to become a maintainer of the Workflow .

CNV tool benchmarking pipeline

This pipeline was set up to test a variety of CNV calling tools on WES and WGS samples sequenced using Illumina short-read sequencing technology.

Installation and configuration

This pipeline is set up as a Snakemake workflow. More information on how to install and run Snakemake can be found at the wiki .

After installation of Snakemake, clone this repository and fill in the config.yaml.

This pipeline is set up to work with the Environment Modules . It likely needs customisation or removal of the module statements in the rules ( tools/*/rules.smk ) for compatilibility with a given computational environment.

Some of the tools in this pipeline requires a panel of normals, which the pipeline does not generate automatically. One such tool is GATK GermlineCNVCaller . More info on generating a panel of normals for this tool can be found at the GATKs tool documentation

Running the pipeline

Once the pipeline has been configured, a given tool can be run using the command snakemake $tool .

A list of available Snakemake targets can be generated by running snakemake --list-target-rules .

By default, Snakemake runs all the tools specified in the config.yaml by just running: snakemake .

Many options are available for customizing snakemake behaviour. These are all documented at the Snakemake Wiki .

Sample data from project

The alignment file GB-WGS-NA12878.bam used in this project is available at the NCBI Sequencing Read Archive (SRA) .

Code Snippets

get_script_dir <- function() {
	initial.options <- commandArgs(trailingOnly = FALSE)
	file.arg.name <- "--file="
	script.name <- sub(file.arg.name, "", initial.options[grep(file.arg.name, initial.options)])
	return(dirname(script.name))
}

setwd(paste(get_script_dir(), "..", sep ="/"))

suppressPackageStartupMessages(library("rtracklayer", warn.conflicts = TRUE))
library("yaml")

RESULTSDIR <- "../results"
TRUTHDIR   <- "truth_set"
OUTDIR     <- "results_filtered"
TOOLS     <- yaml::yaml.load_file("../tools.yaml")

# MIN_OVERLAP <- 0.10 ## Fraction, i.e. 0.10 == "10 %"
MIN_OVERLAP <- 0 ## Disable overlap filtering

BED <- list(
	WGS = import("../resources/hg19/regions/WGS_mappable_regions.bed"),
	WES = import("../resources/hg19/regions/Broad_exomes.bed")
)

## Check tool results ---------------------------------
callset <- list()

IGNORED_TOOLS <- c("alignments", "TIDDIT", "SvABA")

tool_dirs <- list.dirs(RESULTSDIR, recursive=FALSE, full.names = FALSE)
tool_dirs <- subset(tool_dirs, tool_dirs %in% c(TOOLS$WES, TOOLS$WGS))
for (tool in tool_dirs) {
	tool_files <- list.files(paste(RESULTSDIR, tool, sep = "/"), pattern = ".bed$")
	for (bed_file in tool_files) {
		message("NOW: ", paste(tool, bed_file, sep = "/"))
		bed      <- import(paste(RESULTSDIR, tool, bed_file, sep = "/"), genome = "GRCh37")
		bed$name <- as.numeric(bed$name)
		bed$name <- ifelse(bed$name > 2, "DUP", ifelse(bed$name == 2, "REF", "DEL"))
		bed$pval <- bed$score
		bed$score <- 0

		## Filter called CNVs
		# bed <- subset(bed, width(bed) >= 450 & bed$name != "REF")
		bed <- subset(bed, bed$name != "REF" & !seqnames(bed) %in% c("X", "Y", "MT"))

		## Subset to library region
		if (startsWith(bed_file, "GB-WGS")) {library = "WGS"} else {library = "WES"}
		bed <- subsetByOverlaps(bed, BED[[library]], minoverlap = 1, ignore.strand = TRUE)

		## Compare with truth set -----------------------------
		truth_file <- paste(TRUTHDIR, bed_file, sep = "/")
		if (file.exists(truth_file)) {
			truth    <- import(truth_file, genome = "GRCh37")
			truth$pval  <- -1
			hits     <- findOverlaps(bed, truth)

			## Check that CNV type is the same for call set and truth (truth$name == "CNV" is for NA12878)
			hits     <- hits[bed$name[queryHits(hits)] == truth$name[subjectHits(hits)] | truth$name[subjectHits(hits)] == "CNV"]

			## Count called CNVs that overlap a true CNV by at least MIN_OVERLAP
			overlaps <- pintersect(bed[queryHits(hits)], truth[subjectHits(hits)])
			percentOverlap <- width(overlaps) / width(bed[queryHits(hits)])
			hits <- hits[percentOverlap >= MIN_OVERLAP]
			bed[queryHits(hits)]$score <- percentOverlap[percentOverlap >= MIN_OVERLAP]

			## Append true positives for later counting
			truth$score <- Inf
			truth_called <- truth[unique(subjectHits(hits))]
			bed <- c(bed, truth_called)

			## Append false negative calls for later counting
			truth$score <- -Inf
			truth_not_called <- truth[setdiff(seq_along(truth), subjectHits(hits))]
			hits_not_called <- findOverlaps(truth_not_called, BED[[library]], minoverlap = 1, ignore.strand = TRUE)
			bed <- c(bed, truth_not_called[unique(queryHits(hits_not_called))])

			## Add calls to callset list
			sample <- sub("\\.bed$", "", bed_file)
			if (! sample %in% names(callset)) {
				callset[[sample]] <- list(
					cnv = paste(as.character(truth), truth$name, sep = "_")
				)
			}

			# calls <- rep(0, length(truth))
			pvals <- rep(NA, length(truth))
			pvals[subjectHits(hits)] <- bed$pval[queryHits(hits)]
			# calls[subjectHits(hits)] <- 1
			# names(calls) <- paste(as.character(truth), truth$name, sep = "_")
			callset[[sample]][[tool]] <- pvals
		}
		dir.create(paste(OUTDIR, tool, sep = "/"), showWarnings = FALSE)
		bed_string <- export(bed, format = "bed")
		outfile <- paste(OUTDIR, tool, bed_file, sep = "/")
		writeLines(paste(bed_string, bed$pval, sep = "\t"), outfile)
	}
}
callset.df     <- lapply(callset, as.data.frame)
callset.melted <- lapply(callset.df, reshape::melt, id.vars = "cnv", variable_name = "tool")
for (i in names(callset.melted)) {
	callset.melted[[i]]$sample <- i
}
callset.melted$make.row.names = FALSE
callset.final <- do.call("rbind", callset.melted)
# callset.final <- callset.final[, c(4, 1, 2, 3)]

write.table(callset.final, file = "callset.txt", sep = "\t", quote = FALSE, row.names = FALSE)

R yaml rtracklayer From line 2 of scripts/01_filter_results.R

get_script_dir <- function() {
	initial.options <- commandArgs(trailingOnly = FALSE)
	file.arg.name <- "--file="
	script.name <- sub(file.arg.name, "", initial.options[grep(file.arg.name, initial.options)])
	return(dirname(script.name))
}
setwd(paste(get_script_dir(), "..", sep ="/"))

suppressPackageStartupMessages(library("data.table"))
library("yaml")
library("lubridate", warn.conflicts = FALSE)

TRUTHDIR   <- "truth_set"
RESULTSDIR <- "results_filtered"
BENCHMARKDIR <- "../benchmarks"
TOOLS     <- yaml::yaml.load_file("../tools.yaml")

## Functions -----------------------------
flatten_mapply_results <- function(mapply_out.list) {
	mapply_out.df   <- lapply(mapply_out.list, as.data.frame)
	mapply_out.df$make.row.names = FALSE
	mapply_out      <- do.call("rbind", mapply_out.df)
	return(mapply_out)
}

extract_results <- function(sample, tool, resultsdir = RESULTSDIR, truthdir = TRUTHDIR) {
	res <- list(
		sample  = sample,
		tool    = tool,
		library = NA,
		TP      = NA,
		FP      = NA,
		FN      = NA,
		TP_FN   = NA,
		N_truth = NA,
		N_DEL   = NA,
		N_DUP   = NA
	)
	if (startsWith(sample, "GB-WGS")) {res$library = "WGS"} else {res$library = "WES"}
	if (tool == "Cytoscan") {
		bed_file <- sprintf("%s/%s.bed", truthdir, sample)
	} else {
		bed_file <- sprintf("%s/%s/%s.bed", resultsdir, tool, sample)
	}
	if (file.exists(bed_file)) {
		bed <- fread(bed_file, colClasses=list(character = 1))
		if (tool == "Cytoscan") {
			res$N_DEL <- sum(bed$V4 == "DEL")
			res$N_DUP <- sum(bed$V4 == "DUP")
		} else {
			res$N_DEL <- sum(bed$V4 == "DEL" & Inf > bed$V5 & bed$V5 >= 0)
			res$N_DUP <- sum(bed$V4 == "DUP" & Inf > bed$V5 & bed$V5 >= 0)
			truth_file <- sprintf("%s/%s.bed", truthdir, sample)
			if (file.exists(truth_file)) {
				# res$TP <- sum(0 < bed$V5 & bed$V5 <= 1) + sum(bed$V5[1 < bed$V5 & bed$V5 < Inf])
				# res$FN <- sum(bed$V5 == -1)
				# res$FP <- sum(bed$V5 ==  0)
				res$TP <- sum(bed$V5 ==  Inf)
				res$FN <- sum(bed$V5 == -Inf)
				res$FP <- with(res, N_DUP + N_DEL - TP)
				res$TP_FN <- with(res, TP + FN)
				res$N_truth <- R.utils::countLines(truth_file)[1]
			}
		}
	}
	return(res)
}

extract_hist_data <- function(sample, tool, breaks, resultsdir = RESULTSDIR, truthdir = TRUTHDIR) {
	res <- list(
		sample  = sample,
		tool    = tool,
		library = NA,
		bin     = NA,
		counts  = NA
	)
	if (startsWith(sample, "GB-WGS")) {res$library = "WGS"} else {res$library = "WES"}
	if (tool == "Cytoscan") {
		bed_file <- sprintf("%s/%s.bed", truthdir, sample)
	} else {
		bed_file <- sprintf("%s/%s/%s.bed", resultsdir, tool, sample)
	}
	if (file.exists(bed_file)) {
		bed <- fread(bed_file, colClasses=list(character = 1))
		if (tool != "Cytoscan") {
			# bed <- subset(bed, bed$V5 >= 0)
			bed <- subset(bed, Inf > bed$V5 & bed$V5 >= 0)
		}
		bed$width <- bed$V3 - bed$V2
		h <- hist(bed$width, breaks, plot = FALSE)
		res$bin    <- sprintf("%.0f-%.0f", head(breaks, -1), tail(breaks, -1))
		res$counts <- h$counts
	}
	return(res)
}

extract_mlpa_data <- function(sample, tool, resultsdir = RESULTSDIR, truthdir = TRUTHDIR) {
	res <- list(
		sample  = sample,
		tool    = tool,
		library = NA,
		overlap = NA
	)
	if (startsWith(sample, "GB-WGS")) {res$library = "WGS"} else {res$library = "WES"}
	bed_file <- sprintf("%s/%s/%s.bed", resultsdir, tool, sample)
	if (file.exists(bed_file)) {
		bed <- fread(bed_file, colClasses=list(character = 1))
		# if (any(bed$V5 == -1) || all(bed$V5) == 0) {
		# 	res$overlap = 0
		# } else {
		# 	res$overlap <- max(bed$V5)
		# }
		bed <- subset(bed, bed$V5 <= 1)
		res$overlap <- max(bed$V5, 0)
	}
	return(res)
}

extract_benchmarks <- function(sample, tool, benchdir = BENCHMARKDIR) {
	res <- list(
		sample  = sample,
		tool    = tool,
		library = NA,
		cputime = NA,
		peak_memory = NA
	)
	if (startsWith(sample, "GB-WGS")) {res$library = "WGS"} else {res$library = "WES"}
	if (tool == "CLC") {
		input <- sprintf("%s/%s/cpu_time.tsv", benchdir, tool)
		if (file.exists(input)) {
			df <- fread(input)
			df <- subset(df, df$sample == res$sample)
			res$cputime <- as.numeric(hm(df$`elapsed time`))
		}
	} else {
		if (tool %in% c("CNVkit", "CODEX2") || (tool == "cn.MOPS" && res$library == "WES")) {
			input <- Sys.glob(sprintf("%s/%s*/*%s*.txt", benchdir, tool, res$library))
		} else {
			input <- Sys.glob(sprintf("%s/%s*/*%s*.txt", benchdir, tool, sample))
		}
		if (length(input) > 0) {
			df <- do.call("rbind", lapply(input, fread))
			res$cputime     <- sum(df$s)
			res$peak_memory <- max(df$max_rss)
		}
	}
	return(res)
}

## Initialize ---------------------------
# tool_dirs   <- list.dirs(RESULTSDIR, recursive=FALSE, full.names = FALSE)
tool_dirs   <- union(TOOLS$WGS, TOOLS$WES)
cohorts <- fread("../cohorts.txt")
samples <- list(
	all = subset(cohorts, cohorts$cohort != "mlpa")$sample
)
samples$WES <- subset(samples$all, grepl("WES", samples$all))
samples$WGS <- subset(samples$all, grepl("WGS", samples$all))

## Summarize results -----------------------------
message("Creating summary.txt")
l <- ifelse(tool_dirs %in% TOOLS$WGS & tool_dirs %in% TOOLS$WES, list(c(samples$WGS, samples$WES)), ifelse(tool_dirs %in% TOOLS$WES, list(samples$WES), list(samples$WGS)))
map_args <- list(
	sample = do.call("c", l),
	tool   = unlist(mapply(rep, tool_dirs, sapply(l, length)), use.names = FALSE)
)

summary <- flatten_mapply_results(mapply(extract_results, map_args$sample, map_args$tool, SIMPLIFY = FALSE))
summary$precision <- with(summary, TP / (TP + FP))
summary$recall    <- with(summary, TP / (TP + FN))
fwrite(summary, file = "summary.txt", sep = "\t", na = ".", quote = FALSE)


## Make histogram data ---------------------------
message("Creating hist_data.txt")
hist_breaks <- c(0L, 500L, 1000L, 10000L, 100000L, Inf)
hist_data <- flatten_mapply_results(mapply(extract_hist_data, map_args$sample, map_args$tool, MoreArgs = list(breaks = hist_breaks), SIMPLIFY = FALSE))
fwrite(hist_data, file = "hist_data.txt", sep = "\t", na = ".", quote = FALSE)


## Summarize MLPA results ----------------------------
message("Creating mlpa_overlap.txt")
MLPATRUTHDIR <- "truth_set/mlpa"
mlpa_samples <- list.files(MLPATRUTHDIR)

samples$MLPA = subset(cohorts, cohorts$cohort == "mlpa")$sample
samples$MLPA_WES <- subset(samples$MLPA, grepl("WES", samples$MLPA))
samples$MLPA_WGS <- subset(samples$MLPA, grepl("WGS", samples$MLPA))

tool_dirs <- subset(tool_dirs, tool_dirs != "Cytoscan")
l <- ifelse(tool_dirs %in% TOOLS$WGS & tool_dirs %in% TOOLS$WES, list(c(samples$MLPA_WGS, samples$MLPA_WES)), ifelse(tool_dirs %in% TOOLS$WES, list(samples$MLPA_WES), list(samples$MLPA_WGS)))
mlpa_args <- list(
	sample = do.call("c", l),
	tool   = unlist(mapply(rep, tool_dirs, sapply(l, length)), use.names = FALSE)
)

mlpa_overlap    <- flatten_mapply_results(mapply(extract_mlpa_data, mlpa_args$sample, mlpa_args$tool, SIMPLIFY = FALSE))
fwrite(mlpa_overlap, file = "mlpa_overlap.txt", sep = "\t", na = ".", quote = FALSE)


## Summarize computational benchmark results --------------------------
message("Creating benchmarks.txt")
samples$REF = subset(cohorts, cohorts$cohort == "reference")$sample
samples$REF_WES <- subset(samples$REF, grepl("WES", samples$REF))
samples$REF_WGS <- subset(samples$REF, grepl("WGS", samples$REF))

benchmark_args <- list(
	sample = c(rep(samples$REF_WES, length(TOOLS$WES)), rep(samples$REF_WGS, length(TOOLS$WGS))),
	tool   = c(TOOLS$WES, TOOLS$WGS)
)
benchmarks <- flatten_mapply_results(mapply(extract_benchmarks, benchmark_args$sample, benchmark_args$tool, SIMPLIFY = FALSE))
fwrite(benchmarks, file = "benchmarks.txt", sep = "\t", na = ".", quote = FALSE)

R data.table yaml lubridate From line 2 of scripts/02_summarize_results.R

shell:
	"(module load tools samtools/1.9 bwa/{version}; "
	"bwa mem -M -t {threads} -R {params.rg} {input.ref} {input.reads} > {output.sam}; "
	") 2> {log}"

SnakeMake BWA From line 24 of tools/align.smk

shell:
	"(module load tools perl/5.24.0 samtools/1.9 biobambam2/{version}; "

SnakeMake From line 44 of tools/align.smk

shell:
	"(module load tools java/1.8.0 gatk/{version}; "

SnakeMake From line 62 of tools/align.smk

shell:
	"(module load tools java/1.8.0 samtools/1.9 gatk/{version}; "
	"gatk BaseRecalibrator "
	"--known-sites {input.dbsnp} "
	"--known-sites {input.indels} "
	"--known-sites {input.snps1k} "
	"--known-sites {input.indels1k} "
	"-R {input.ref} "
	"-I {input.bam} "
	"-O {output.bqsr} "
	") 2> {log} "

SnakeMake gatk From line 97 of tools/align.smk

shell:
	"(module load tools java/1.8.0 samtools/1.9 gatk/{version}; "
	"gatk ApplyBQSR -R {input.ref} "
	"-bqsr {input.bqsr} "
	"-I {input.bam} "
	"-O {output.bam}; "
	"samtools index -@ {threads} {output.bam} {output.bai}"
	") 2> {log}"

SnakeMake SAMtools gatk From line 126 of tools/align.smk

shell:
	"(module load tools samtools/{version}; "
	"samtools view -C -T {input.ref} -@ {threads} {input.bam} > {output.cram}; "
	"samtools index -@ {threads} {output.cram} {output.crai}; "
	") 2> {log}"

SnakeMake SAMtools From line 149 of tools/align.smk

shell:
	"cat {input.wes} {input.wgs} | sort -k1,1 -k2,2 -V | uniq > {output.bed}; "
	"cat {input.wes} | sort -k1,1 -k2,2 -V | uniq > {output.wes}; "
	"cat {input.wgs} | sort -k1,1 -k2,2 -V | uniq > {output.wgs}; "

SnakeMake From line 163 of tools/align.smk

shell:
	"module load bedtools/2.27.1; "
	"bedtools subtract -a {input.truth} -b {input.region} > {output.bed}"

SnakeMake BEDTools From line 174 of tools/align.smk

shell:
	"(module load tools htslib/1.9 mosdepth/0.2.4; "
	"MOSDEPTH_PRECISION=5 mosdepth -f {input.ref} -n --by {input.bed} {params.prefix} {input.cram}; "
	") 2> {log}"

SnakeMake mosdepth From line 192 of tools/align.smk

shell:
	"(module load intel/redist/2019_update2 intel/compiler/64/2019_update2 R/3.5.0; "
	"Rscript concordance/scripts/01_filter_results.R"
	") 2> {log}"

SnakeMake From line 9 of tools/concordance.smk

shell:
	"(module load intel/redist/2019_update2 intel/compiler/64/2019_update2 R/3.5.0; "
	"Rscript concordance/scripts/02_summarize_results.R"
	") 2> {log}"