Benchmarking of CNV calling tools

public public 1yr ago 0 bookmarks

CNV tool benchmarking pipeline

This pipeline was set up to test a variety of CNV calling tools on WES and WGS samples sequenced using Illumina short-read sequencing technology.

Installation and configuration

This pipeline is set up as a Snakemake workflow. More information on how to install and run Snakemake can be found at the wiki .

After installation of Snakemake, clone this repository and fill in the config.yaml.

This pipeline is set up to work with the Environment Modules . It likely needs customisation or removal of the module statements in the rules ( tools/*/rules.smk ) for compatilibility with a given computational environment.

Some of the tools in this pipeline requires a panel of normals, which the pipeline does not generate automatically. One such tool is GATK GermlineCNVCaller . More info on generating a panel of normals for this tool can be found at the GATKs tool documentation

Running the pipeline

Once the pipeline has been configured, a given tool can be run using the command snakemake $tool .

A list of available Snakemake targets can be generated by running snakemake --list-target-rules .

By default, Snakemake runs all the tools specified in the config.yaml by just running: snakemake .

Many options are available for customizing snakemake behaviour. These are all documented at the Snakemake Wiki .

Sample data from project

The alignment file GB-WGS-NA12878.bam used in this project is available at the NCBI Sequencing Read Archive (SRA) .

Code Snippets

  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
get_script_dir <- function() {
	initial.options <- commandArgs(trailingOnly = FALSE)
	file.arg.name <- "--file="
	script.name <- sub(file.arg.name, "", initial.options[grep(file.arg.name, initial.options)])
	return(dirname(script.name))
}

setwd(paste(get_script_dir(), "..", sep ="/"))

suppressPackageStartupMessages(library("rtracklayer", warn.conflicts = TRUE))
library("yaml")

RESULTSDIR <- "../results"
TRUTHDIR   <- "truth_set"
OUTDIR     <- "results_filtered"
TOOLS     <- yaml::yaml.load_file("../tools.yaml")

# MIN_OVERLAP <- 0.10 ## Fraction, i.e. 0.10 == "10 %"
MIN_OVERLAP <- 0 ## Disable overlap filtering

BED <- list(
	WGS = import("../resources/hg19/regions/WGS_mappable_regions.bed"),
	WES = import("../resources/hg19/regions/Broad_exomes.bed")
)

## Check tool results ---------------------------------
callset <- list()

IGNORED_TOOLS <- c("alignments", "TIDDIT", "SvABA")

tool_dirs <- list.dirs(RESULTSDIR, recursive=FALSE, full.names = FALSE)
tool_dirs <- subset(tool_dirs, tool_dirs %in% c(TOOLS$WES, TOOLS$WGS))
for (tool in tool_dirs) {
	tool_files <- list.files(paste(RESULTSDIR, tool, sep = "/"), pattern = ".bed$")
	for (bed_file in tool_files) {
		message("NOW: ", paste(tool, bed_file, sep = "/"))
		bed      <- import(paste(RESULTSDIR, tool, bed_file, sep = "/"), genome = "GRCh37")
		bed$name <- as.numeric(bed$name)
		bed$name <- ifelse(bed$name > 2, "DUP", ifelse(bed$name == 2, "REF", "DEL"))
		bed$pval <- bed$score
		bed$score <- 0

		## Filter called CNVs
		# bed <- subset(bed, width(bed) >= 450 & bed$name != "REF")
		bed <- subset(bed, bed$name != "REF" & !seqnames(bed) %in% c("X", "Y", "MT"))

		## Subset to library region
		if (startsWith(bed_file, "GB-WGS")) {library = "WGS"} else {library = "WES"}
		bed <- subsetByOverlaps(bed, BED[[library]], minoverlap = 1, ignore.strand = TRUE)

		## Compare with truth set -----------------------------
		truth_file <- paste(TRUTHDIR, bed_file, sep = "/")
		if (file.exists(truth_file)) {
			truth    <- import(truth_file, genome = "GRCh37")
			truth$pval  <- -1
			hits     <- findOverlaps(bed, truth)

			## Check that CNV type is the same for call set and truth (truth$name == "CNV" is for NA12878)
			hits     <- hits[bed$name[queryHits(hits)] == truth$name[subjectHits(hits)] | truth$name[subjectHits(hits)] == "CNV"]

			## Count called CNVs that overlap a true CNV by at least MIN_OVERLAP
			overlaps <- pintersect(bed[queryHits(hits)], truth[subjectHits(hits)])
			percentOverlap <- width(overlaps) / width(bed[queryHits(hits)])
			hits <- hits[percentOverlap >= MIN_OVERLAP]
			bed[queryHits(hits)]$score <- percentOverlap[percentOverlap >= MIN_OVERLAP]

			## Append true positives for later counting
			truth$score <- Inf
			truth_called <- truth[unique(subjectHits(hits))]
			bed <- c(bed, truth_called)

			## Append false negative calls for later counting
			truth$score <- -Inf
			truth_not_called <- truth[setdiff(seq_along(truth), subjectHits(hits))]
			hits_not_called <- findOverlaps(truth_not_called, BED[[library]], minoverlap = 1, ignore.strand = TRUE)
			bed <- c(bed, truth_not_called[unique(queryHits(hits_not_called))])

			## Add calls to callset list
			sample <- sub("\\.bed$", "", bed_file)
			if (! sample %in% names(callset)) {
				callset[[sample]] <- list(
					cnv = paste(as.character(truth), truth$name, sep = "_")
				)
			}

			# calls <- rep(0, length(truth))
			pvals <- rep(NA, length(truth))
			pvals[subjectHits(hits)] <- bed$pval[queryHits(hits)]
			# calls[subjectHits(hits)] <- 1
			# names(calls) <- paste(as.character(truth), truth$name, sep = "_")
			callset[[sample]][[tool]] <- pvals
		}
		dir.create(paste(OUTDIR, tool, sep = "/"), showWarnings = FALSE)
		bed_string <- export(bed, format = "bed")
		outfile <- paste(OUTDIR, tool, bed_file, sep = "/")
		writeLines(paste(bed_string, bed$pval, sep = "\t"), outfile)
	}
}
callset.df     <- lapply(callset, as.data.frame)
callset.melted <- lapply(callset.df, reshape::melt, id.vars = "cnv", variable_name = "tool")
for (i in names(callset.melted)) {
	callset.melted[[i]]$sample <- i
}
callset.melted$make.row.names = FALSE
callset.final <- do.call("rbind", callset.melted)
# callset.final <- callset.final[, c(4, 1, 2, 3)]

write.table(callset.final, file = "callset.txt", sep = "\t", quote = FALSE, row.names = FALSE)
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
get_script_dir <- function() {
	initial.options <- commandArgs(trailingOnly = FALSE)
	file.arg.name <- "--file="
	script.name <- sub(file.arg.name, "", initial.options[grep(file.arg.name, initial.options)])
	return(dirname(script.name))
}
setwd(paste(get_script_dir(), "..", sep ="/"))

suppressPackageStartupMessages(library("data.table"))
library("yaml")
library("lubridate", warn.conflicts = FALSE)

TRUTHDIR   <- "truth_set"
RESULTSDIR <- "results_filtered"
BENCHMARKDIR <- "../benchmarks"
TOOLS     <- yaml::yaml.load_file("../tools.yaml")

## Functions -----------------------------
flatten_mapply_results <- function(mapply_out.list) {
	mapply_out.df   <- lapply(mapply_out.list, as.data.frame)
	mapply_out.df$make.row.names = FALSE
	mapply_out      <- do.call("rbind", mapply_out.df)
	return(mapply_out)
}

extract_results <- function(sample, tool, resultsdir = RESULTSDIR, truthdir = TRUTHDIR) {
	res <- list(
		sample  = sample,
		tool    = tool,
		library = NA,
		TP      = NA,
		FP      = NA,
		FN      = NA,
		TP_FN   = NA,
		N_truth = NA,
		N_DEL   = NA,
		N_DUP   = NA
	)
	if (startsWith(sample, "GB-WGS")) {res$library = "WGS"} else {res$library = "WES"}
	if (tool == "Cytoscan") {
		bed_file <- sprintf("%s/%s.bed", truthdir, sample)
	} else {
		bed_file <- sprintf("%s/%s/%s.bed", resultsdir, tool, sample)
	}
	if (file.exists(bed_file)) {
		bed <- fread(bed_file, colClasses=list(character = 1))
		if (tool == "Cytoscan") {
			res$N_DEL <- sum(bed$V4 == "DEL")
			res$N_DUP <- sum(bed$V4 == "DUP")
		} else {
			res$N_DEL <- sum(bed$V4 == "DEL" & Inf > bed$V5 & bed$V5 >= 0)
			res$N_DUP <- sum(bed$V4 == "DUP" & Inf > bed$V5 & bed$V5 >= 0)
			truth_file <- sprintf("%s/%s.bed", truthdir, sample)
			if (file.exists(truth_file)) {
				# res$TP <- sum(0 < bed$V5 & bed$V5 <= 1) + sum(bed$V5[1 < bed$V5 & bed$V5 < Inf])
				# res$FN <- sum(bed$V5 == -1)
				# res$FP <- sum(bed$V5 ==  0)
				res$TP <- sum(bed$V5 ==  Inf)
				res$FN <- sum(bed$V5 == -Inf)
				res$FP <- with(res, N_DUP + N_DEL - TP)
				res$TP_FN <- with(res, TP + FN)
				res$N_truth <- R.utils::countLines(truth_file)[1]
			}
		}
	}
	return(res)
}

extract_hist_data <- function(sample, tool, breaks, resultsdir = RESULTSDIR, truthdir = TRUTHDIR) {
	res <- list(
		sample  = sample,
		tool    = tool,
		library = NA,
		bin     = NA,
		counts  = NA
	)
	if (startsWith(sample, "GB-WGS")) {res$library = "WGS"} else {res$library = "WES"}
	if (tool == "Cytoscan") {
		bed_file <- sprintf("%s/%s.bed", truthdir, sample)
	} else {
		bed_file <- sprintf("%s/%s/%s.bed", resultsdir, tool, sample)
	}
	if (file.exists(bed_file)) {
		bed <- fread(bed_file, colClasses=list(character = 1))
		if (tool != "Cytoscan") {
			# bed <- subset(bed, bed$V5 >= 0)
			bed <- subset(bed, Inf > bed$V5 & bed$V5 >= 0)
		}
		bed$width <- bed$V3 - bed$V2
		h <- hist(bed$width, breaks, plot = FALSE)
		res$bin    <- sprintf("%.0f-%.0f", head(breaks, -1), tail(breaks, -1))
		res$counts <- h$counts
	}
	return(res)
}

extract_mlpa_data <- function(sample, tool, resultsdir = RESULTSDIR, truthdir = TRUTHDIR) {
	res <- list(
		sample  = sample,
		tool    = tool,
		library = NA,
		overlap = NA
	)
	if (startsWith(sample, "GB-WGS")) {res$library = "WGS"} else {res$library = "WES"}
	bed_file <- sprintf("%s/%s/%s.bed", resultsdir, tool, sample)
	if (file.exists(bed_file)) {
		bed <- fread(bed_file, colClasses=list(character = 1))
		# if (any(bed$V5 == -1) || all(bed$V5) == 0) {
		# 	res$overlap = 0
		# } else {
		# 	res$overlap <- max(bed$V5)
		# }
		bed <- subset(bed, bed$V5 <= 1)
		res$overlap <- max(bed$V5, 0)
	}
	return(res)
}

extract_benchmarks <- function(sample, tool, benchdir = BENCHMARKDIR) {
	res <- list(
		sample  = sample,
		tool    = tool,
		library = NA,
		cputime = NA,
		peak_memory = NA
	)
	if (startsWith(sample, "GB-WGS")) {res$library = "WGS"} else {res$library = "WES"}
	if (tool == "CLC") {
		input <- sprintf("%s/%s/cpu_time.tsv", benchdir, tool)
		if (file.exists(input)) {
			df <- fread(input)
			df <- subset(df, df$sample == res$sample)
			res$cputime <- as.numeric(hm(df$`elapsed time`))
		}
	} else {
		if (tool %in% c("CNVkit", "CODEX2") || (tool == "cn.MOPS" && res$library == "WES")) {
			input <- Sys.glob(sprintf("%s/%s*/*%s*.txt", benchdir, tool, res$library))
		} else {
			input <- Sys.glob(sprintf("%s/%s*/*%s*.txt", benchdir, tool, sample))
		}
		if (length(input) > 0) {
			df <- do.call("rbind", lapply(input, fread))
			res$cputime     <- sum(df$s)
			res$peak_memory <- max(df$max_rss)
		}
	}
	return(res)
}

## Initialize ---------------------------
# tool_dirs   <- list.dirs(RESULTSDIR, recursive=FALSE, full.names = FALSE)
tool_dirs   <- union(TOOLS$WGS, TOOLS$WES)
cohorts <- fread("../cohorts.txt")
samples <- list(
	all = subset(cohorts, cohorts$cohort != "mlpa")$sample
)
samples$WES <- subset(samples$all, grepl("WES", samples$all))
samples$WGS <- subset(samples$all, grepl("WGS", samples$all))

## Summarize results -----------------------------
message("Creating summary.txt")
l <- ifelse(tool_dirs %in% TOOLS$WGS & tool_dirs %in% TOOLS$WES, list(c(samples$WGS, samples$WES)), ifelse(tool_dirs %in% TOOLS$WES, list(samples$WES), list(samples$WGS)))
map_args <- list(
	sample = do.call("c", l),
	tool   = unlist(mapply(rep, tool_dirs, sapply(l, length)), use.names = FALSE)
)

summary <- flatten_mapply_results(mapply(extract_results, map_args$sample, map_args$tool, SIMPLIFY = FALSE))
summary$precision <- with(summary, TP / (TP + FP))
summary$recall    <- with(summary, TP / (TP + FN))
fwrite(summary, file = "summary.txt", sep = "\t", na = ".", quote = FALSE)


## Make histogram data ---------------------------
message("Creating hist_data.txt")
hist_breaks <- c(0L, 500L, 1000L, 10000L, 100000L, Inf)
hist_data <- flatten_mapply_results(mapply(extract_hist_data, map_args$sample, map_args$tool, MoreArgs = list(breaks = hist_breaks), SIMPLIFY = FALSE))
fwrite(hist_data, file = "hist_data.txt", sep = "\t", na = ".", quote = FALSE)


## Summarize MLPA results ----------------------------
message("Creating mlpa_overlap.txt")
MLPATRUTHDIR <- "truth_set/mlpa"
mlpa_samples <- list.files(MLPATRUTHDIR)

samples$MLPA = subset(cohorts, cohorts$cohort == "mlpa")$sample
samples$MLPA_WES <- subset(samples$MLPA, grepl("WES", samples$MLPA))
samples$MLPA_WGS <- subset(samples$MLPA, grepl("WGS", samples$MLPA))

tool_dirs <- subset(tool_dirs, tool_dirs != "Cytoscan")
l <- ifelse(tool_dirs %in% TOOLS$WGS & tool_dirs %in% TOOLS$WES, list(c(samples$MLPA_WGS, samples$MLPA_WES)), ifelse(tool_dirs %in% TOOLS$WES, list(samples$MLPA_WES), list(samples$MLPA_WGS)))
mlpa_args <- list(
	sample = do.call("c", l),
	tool   = unlist(mapply(rep, tool_dirs, sapply(l, length)), use.names = FALSE)
)

mlpa_overlap    <- flatten_mapply_results(mapply(extract_mlpa_data, mlpa_args$sample, mlpa_args$tool, SIMPLIFY = FALSE))
fwrite(mlpa_overlap, file = "mlpa_overlap.txt", sep = "\t", na = ".", quote = FALSE)


## Summarize computational benchmark results --------------------------
message("Creating benchmarks.txt")
samples$REF = subset(cohorts, cohorts$cohort == "reference")$sample
samples$REF_WES <- subset(samples$REF, grepl("WES", samples$REF))
samples$REF_WGS <- subset(samples$REF, grepl("WGS", samples$REF))

benchmark_args <- list(
	sample = c(rep(samples$REF_WES, length(TOOLS$WES)), rep(samples$REF_WGS, length(TOOLS$WGS))),
	tool   = c(TOOLS$WES, TOOLS$WGS)
)
benchmarks <- flatten_mapply_results(mapply(extract_benchmarks, benchmark_args$sample, benchmark_args$tool, SIMPLIFY = FALSE))
fwrite(benchmarks, file = "benchmarks.txt", sep = "\t", na = ".", quote = FALSE)
24
25
26
27
shell:
	"(module load tools samtools/1.9 bwa/{version}; "
	"bwa mem -M -t {threads} -R {params.rg} {input.ref} {input.reads} > {output.sam}; "
	") 2> {log}"
44
45
shell:
	"(module load tools perl/5.24.0 samtools/1.9 biobambam2/{version}; "
62
63
shell:
	"(module load tools java/1.8.0 gatk/{version}; "
 97
 98
 99
100
101
102
103
104
105
106
107
shell:
	"(module load tools java/1.8.0 samtools/1.9 gatk/{version}; "
	"gatk BaseRecalibrator "
	"--known-sites {input.dbsnp} "
	"--known-sites {input.indels} "
	"--known-sites {input.snps1k} "
	"--known-sites {input.indels1k} "
	"-R {input.ref} "
	"-I {input.bam} "
	"-O {output.bqsr} "
	") 2> {log} "
126
127
128
129
130
131
132
133
shell:
	"(module load tools java/1.8.0 samtools/1.9 gatk/{version}; "
	"gatk ApplyBQSR -R {input.ref} "
	"-bqsr {input.bqsr} "
	"-I {input.bam} "
	"-O {output.bam}; "
	"samtools index -@ {threads} {output.bam} {output.bai}"
	") 2> {log}"
149
150
151
152
153
shell:
	"(module load tools samtools/{version}; "
	"samtools view -C -T {input.ref} -@ {threads} {input.bam} > {output.cram}; "
	"samtools index -@ {threads} {output.cram} {output.crai}; "
	") 2> {log}"
163
164
165
166
shell:
	"cat {input.wes} {input.wgs} | sort -k1,1 -k2,2 -V | uniq > {output.bed}; "
	"cat {input.wes} | sort -k1,1 -k2,2 -V | uniq > {output.wes}; "
	"cat {input.wgs} | sort -k1,1 -k2,2 -V | uniq > {output.wgs}; "
SnakeMake From line 163 of tools/align.smk
174
175
176
shell:
	"module load bedtools/2.27.1; "
	"bedtools subtract -a {input.truth} -b {input.region} > {output.bed}"
192
193
194
195
shell:
	"(module load tools htslib/1.9 mosdepth/0.2.4; "
	"MOSDEPTH_PRECISION=5 mosdepth -f {input.ref} -n --by {input.bed} {params.prefix} {input.cram}; "
	") 2> {log}"
 9
10
11
12
shell:
	"(module load intel/redist/2019_update2 intel/compiler/64/2019_update2 R/3.5.0; "
	"Rscript concordance/scripts/01_filter_results.R"
	") 2> {log}"
20
21
22
23
shell:
	"(module load intel/redist/2019_update2 intel/compiler/64/2019_update2 R/3.5.0; "
	"Rscript concordance/scripts/02_summarize_results.R"
	") 2> {log}"
ShowHide 6 more snippets with no or duplicated tags.

Login to post a comment if you would like to share your experience with this workflow.

Do you know this workflow well? If so, you can request seller status , and start supporting this workflow.

Free

Created: 1yr ago
Updated: 1yr ago
Maitainers: public
URL: https://github.com/cphgeno/CNVbench
Name: cnvbench
Version: 1
Badge:
workflow icon

Insert copied code into your website to add a link to this workflow.

Downloaded: 0
Copyright: Public Domain
License: MIT License
  • Future updates

Related Workflows

cellranger-snakemake-gke
snakemake workflow to run cellranger on a given bucket using gke.
A Snakemake workflow for running cellranger on a given bucket using Google Kubernetes Engine. The usage of this workflow ...