Low-Pass Whole Genome Sequencing Copy Number Calling Workflow (LPWGS)


This is a Snakemake workflow for generating copy number calls from low-pass whole-genome sequencing data. It aligns FASTQ files, performs QC, and then runs QDNAseq.

Usage

Step 1: Install workflow

If you simply want to use this workflow, download and extract the latest release. If you intend to modify and further extend this workflow, or want to work under version control, fork this repository as outlined in the Advanced section; the latter approach is recommended.
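For example, to work from a version-controlled copy you can clone the repository directly (the repository URL is listed at the bottom of this page):

git clone https://github.com/marcjwilliams1/lpWGS.git
cd lpWGS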

Step 2: Configure workflow

Configure the workflow according to your needs by editing the file config.yaml.
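As a rough starting point, you could bootstrap a config from the shell like this. Note that the key names below are assumptions inferred from the parameters used in the rule snippets further down (genome, picard, singularityimage, pipelinedirectory, binsize, readscutoff), not a documented schema:

cat > config.yaml <<'EOF'
genome: /path/to/reference.fasta       # assumed key: bwa-indexed reference genome
picard: /path/to/picard.jar            # assumed key: path to the Picard jar
singularityimage: /path/to/image.sif   # assumed key: container used for the R steps
pipelinedirectory: /path/to/lpWGS      # assumed key: location of this workflow checkout
binsize: 500                           # assumed key: QDNAseq bin size (the script default is 500)
readscutoff: 100000                    # assumed key: minimum reads per sample for the report
EOF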

Step 3: Execute workflow

Make sure you have Snakemake installed on your HPC. If you are running on Apocrita (the QMUL HPC) you may need to install it in a virtual environment. For example, I run the following to activate an environment with Snakemake installed:

source /data/home/hfx042/bin/snakemake/bin/activate
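Once the environment is active, a typical run might look like the following. The job count is arbitrary, and on a cluster you would add your scheduler's submission flags or a Snakemake profile:

snakemake -n           # dry-run: print the jobs that would be executed
snakemake --cores 8    # run the workflow with up to 8 parallel jobs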

Code Snippets

Merge FASTQ files:
shell:
    """
    echo "Merging fastq files R1"
    echo "fastq files: "
    echo {input.R1}
    cat {input.R1} > {output.tempfastqR1}
    echo "Merging fastq files R2"
    echo "fastq files: "
    echo {input.R2}
    cat {input.R2} > {output.tempfastqR2}
    echo "Finished merging fastqfiles"
    """
Align reads with bwa mem and add read groups with Picard:
shell:
    """
    module load bwa/0.7.15
    module load samtools/1.8

    bwa mem -M -t {threads} \
    {params.genome} \
    {input.tempfastqR1} {input.tempfastqR2} | \
    samtools view -S -b - > {output}.temp.bam

    module load java
    java -jar -Xmx2G {params.picard} AddOrReplaceReadGroups \
       I={output}.temp.bam \
       O={output} \
       RGID={wildcards.sample} \
       RGLB={wildcards.sample} \
       RGPL=ILLUMINA \
       RGSM={wildcards.sample} \
       RGPU={wildcards.sample}

    rm {input.tempfastqR1}
    rm {input.tempfastqR2}
    rm {output}.temp.bam
    """
Coordinate-sort the BAM with Picard SortSam:
shell:
    """
    module load java
    java -jar -Xmx4G {params.picard} SortSam \
        INPUT={input.bam} \
        OUTPUT={output.bam} \
        SORT_ORDER=coordinate
    """
Mark and remove duplicates with Picard MarkDuplicates:
    shell:
        """
        module load java
        java -jar -Xmx4G {params.picard} MarkDuplicates \
            INPUT={input.bam} \
            OUTPUT={output.bam} \
            METRICS_FILE={output.metrics}.temp \
            CREATE_INDEX=true \
            REMOVE_DUPLICATES=true

        # keep only the metrics table: the "## METRICS" marker line plus the header and value rows
        grep -A2 "## METRICS" {output.metrics}.temp | tail -n +1 > {output.metrics}
        rm {output.metrics}.temp
        """
Run QDNAseq inside the Singularity container:
shell:
    """
    module load R
    module load singularity

    singularity exec {params.singularityimage} \
    Rscript {params.pipelinedirectory}/scripts/QDNAseq.R \
        --bamfiles {input.bams} \
        --binsize {params.binsize} \
        --plotdir {output.plotdir} \
        --Rdata {output.Rdata} \
        --segmentfile {output.segmentfile} \
        --pipelinedirectory {params.pipelinedirectory} \
        --filter ""
    """
Run FastQC on the raw FASTQ files:
shell:
    """
    echo "Loading fastQC"
    module load fastqc
    fastqc {input.R1} -o {output}
    fastqc {input.R2} -o {output}
    """
Collect Picard QC metrics (CollectWgsMetrics, CollectInsertSizeMetrics, CollectAlignmentSummaryMetrics, QualityScoreDistribution):
shell:
    """
    longLine="--------------------"
    module load java
    module load R

    msg="Run picard CollectWGSMetrics"; echo "-- $msg $longLine"; >&2 echo "-- $msg $longLine"

    java -jar -Xmx4G {params.picard} CollectWgsMetrics \
        INPUT={input.bam} \
        OUTPUT={output.metricsWGS}.temp \
        R={params.genome}
    grep -A2  "## METRICS" {output.metricsWGS}.temp | tail -n +1 > {output.metricsWGS}
    rm {output.metricsWGS}.temp


    msg="Run picard insertsize metrics"; echo "-- $msg $longLine"; >&2 echo "-- $msg $longLine"
    java -jar -Xmx4G {params.picard} CollectInsertSizeMetrics \
        INPUT={input.bam} \
        OUTPUT={output.metricsInsert}.temp \
        H={output.metricsInsert}.pdf
    grep -A2  "## METRICS" {output.metricsInsert}.temp | tail -n +1 > {output.metricsInsert}
    rm {output.metricsInsert}.temp

    msg="Run picard CollectAlignmentSummaryMetrics"; echo "-- $msg $longLine"; >&2 echo "-- $msg $longLine"
    java -jar -Xmx4G {params.picard} CollectAlignmentSummaryMetrics \
        INPUT={input.bam} \
        OUTPUT={output.metricsAlign}.temp \
        ADAPTER_SEQUENCE=[CTGTCTCTTATA,TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG,GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG,AGATCGGAAGAGC,ACGCTCTTCCGATCT] \
        R={params.genome}
    grep -A2  "## METRICS" {output.metricsAlign}.temp | tail -n +1 > {output.metricsAlign}
    rm {output.metricsAlign}.temp

    msg="Run picard QualityScoreDistribution"; echo "-- $msg $longLine"; >&2 echo "-- $msg $longLine"
    java -jar -Xmx4G {params.picard} QualityScoreDistribution \
        INPUT={input.bam} \
        OUTPUT={output.metricsQS} \
        CHART={output.metricsQS}.pdf

    """
Combine per-sample QC metrics:
shell:
    """
    module load singularity
    singularity exec {params.singularityimage} \
    Rscript {params.pipelinedirectory}/scripts/combineQC.R \
        --WGS {input.metricsWGS} \
        --insertsize {input.metricsInsert} \
        --align {input.metricsAlign} \
        --dedup {input.metricsDedup} \
        --output {output}
    """
Generate the final report:
shell:
    """
    #mkdir {output.plotdir}
    module load singularity
    singularity exec {params.singularityimage} \
    Rscript {params.pipelinedirectory}/scripts/report.R \
        --QC {input.QC} \
        --output {output.report} \
        --CNA {input.CNA} \
        --plotdir {output.plotdir} \
        --pipelinedirectory {params.pipelinedirectory} \
        --readscutoff {params.readscutoff}
    """
scripts/combineQC.R, which combines the Picard metrics into a single CSV:
library(dplyr)
library(readr)
library(tidyr)
library(stringr)
library(argparse)

parser <- ArgumentParser(description = "Parse arguments to combine QC metrics")
parser$add_argument('--WGS', type = 'character',
                    help="List of WGS metrics files", default = NULL, nargs = "+")
parser$add_argument('--insertsize', type = 'character',
                    help="List of insert size metrics files", default = NULL, nargs = "+")
parser$add_argument('--align', type = 'character',
                    help="List of alignment summary metrics files", default = NULL, nargs = "+")
parser$add_argument('--dedup', type = 'character',
                    help="List of duplication metrics files", default = NULL, nargs = "+")
parser$add_argument('--output', type = 'character',
                    help="Output CSV file")
args <- parser$parse_args()

# read a set of Picard metrics files, labelling each row with the sample name
# parsed from the file name
read_metrics <- function(files){
  bind_rows(lapply(files, function(file){
    print(file)
    metrics <- read.table(file, sep = "\t", header = TRUE)
    metrics %>% mutate(samplename = strsplit(basename(file), "[.]")[[1]][1])
  }))
}

dfWGS <- read_metrics(args$WGS)
dfinsertsize <- read_metrics(args$insertsize)
dfalign <- read_metrics(args$align)
dfdedup <- read_metrics(args$dedup)

#df <- bind_cols(dfWGS, dfinsertsize, dfalign, dfdedup)
df <- dfWGS %>%
    left_join(., dfinsertsize) %>%
    left_join(., dfalign) %>%
    left_join(., dfdedup %>% select(-LIBRARY))

write_csv(df, args$output)
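Based on the argparse definitions above, the script can also be invoked standalone; the file names below are illustrative:

Rscript scripts/combineQC.R \
    --WGS sample1.wgs.txt sample2.wgs.txt \
    --insertsize sample1.insert.txt sample2.insert.txt \
    --align sample1.align.txt sample2.align.txt \
    --dedup sample1.dedup.txt sample2.dedup.txt \
    --output combined_qc.csv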
scripts/QDNAseq.R, which runs the QDNAseq copy number analysis:
library(QDNAseq)
library(methods)
library(dplyr)
library(argparse)
library(stringr)

parser <- ArgumentParser(description = "Parse arguments for QDNAseq analysis")
parser$add_argument('--binsize', type = 'double',
                    help="Binsize for QDNAseq run", default = 500)
parser$add_argument('--bamfiles', type = 'character',
                    help="List of bam files to process", default = NULL, nargs = "+")
parser$add_argument('--plotdir', type = 'character',
                    help="Plotting directory", default = NULL)
parser$add_argument('--Rdata', type = 'character',
                    help="Rdata file", default = NULL)
parser$add_argument('--segmentfile', type = 'character',
                    help="Text file for segments", default = NULL)
parser$add_argument('--pipelinedirectory', type = 'character',
                    help="Pipeline directory", default = NULL)
parser$add_argument('--filter', type = 'character',nargs = "+",
                    help="Comma-separated string used to select bam files by name", default = NULL)
args <- parser$parse_args()

source(paste0(args$pipelinedirectory, "/scripts/blacklist.R"))
#outdir <- dirname(args$Rdata)

#directory to store plots
#plotdir <- paste0(args$plotdir, "binsize", as.numeric(binsize))
plotdir <- args$plotdir
print(plotdir)

print(paste0("Number of bam files: ", length(args$bamfiles)))

if (dir.exists(plotdir) == FALSE){
  dir.create(plotdir, recursive = T)
}

args$filter <- paste0(gsub(",", "[0-9]+|", args$filter), "[0-9]+")
print(args$filter)
print(str_detect(args$bamfiles, args$filter))

args$bamfiles <- args$bamfiles[str_detect(args$bamfiles, args$filter)]
print(args$bamfiles)

basenames <- basename(args$bamfiles)
basenames <- sub(sprintf('[\\.]?%s$', "bam"), '', basenames)
bamnames <- basenames
print(bamnames)

#download bin annotations
bins <- getBinAnnotations(binSize = as.numeric(args$binsize))
binsnew <- bins

#remove bins in the following regions (columns: chromosome, start, end)
regions <- cbind(c(6, 4, 17), c(28500001, 69200001, 43900000), c(33500000, 69300000, 44100001))
binsnew$blacklist <- calculateBlacklistByRegions(binsnew, regions)

bins@data <- left_join(bins@data, binsnew@data,
                       by = c("chromosome", "start", "end", "bases", "gc", "mappability", "residual", "use")) %>%
             mutate(blacklist = blacklist.x,
               blacklist = ifelse(blacklist.y > 0, blacklist.y, blacklist)) %>%
            select(-blacklist.x, -blacklist.y)

#load sequencing data
readCounts <- binReadCounts(bins, bamfiles=args$bamfiles, bamnames = bamnames, pairedEnds = TRUE)

#plot raw readcounts
pdf(paste(plotdir,"/","raw_profile",".pdf",sep=""),7,4)
plot(readCounts, logTransform=FALSE, ylim=c(-10, 50),main=paste("Raw Profile ",sep=""))
highlightFilters(readCounts, logTransform=FALSE,residual=TRUE, blacklist=TRUE)
dev.off()

#apply filters and plot median read counts per bin as a function of GC content and mappability
readCountsFiltered <- applyFilters(readCounts, residual=TRUE, blacklist=TRUE)
pdf(paste(plotdir,"/","isobar",".pdf",sep=""),7,4)
isobarPlot(readCountsFiltered)
dev.off()

#Estimate the correction for GC content and mappability, and make a plot for the relationship between the
#observed standard deviation in the data and its read depth
readCountsFiltered <- estimateCorrection(readCountsFiltered)
pdf(paste(plotdir,"/","noise",".pdf",sep=""),7,4)
noisePlot(readCountsFiltered)
dev.off()

#apply the correction for GC content and mappability which we then normalize, smooth outliers, calculate segmentation
#and plot the copy number profile
copyNumbers <- correctBins(readCountsFiltered)
copyNumbersNormalized <- normalizeBins(copyNumbers)
copyNumbersSmooth <- smoothOutlierBins(copyNumbersNormalized)

pdf(paste(plotdir,"/","copy_number_profile",".pdf",sep=""),7,4)
plot(copyNumbersSmooth, ylim=c(-2,2))
dev.off()

copyNumbersSegmented <- segmentBins(copyNumbersSmooth)
print("bins segmented")
copyNumbersSegmented <- normalizeSegmentedBins(copyNumbersSegmented)

pdf(paste(plotdir,"/","segments",".pdf",sep=""),7,4)
plot(copyNumbersSegmented, ylim=c(-2,2), gaincol = "firebrick3", losscol = "dodgerblue3", delcol="dodgerblue4", ampcol = "firebrick4")
dev.off()

copyNumbersCalled <- callBins(copyNumbersSegmented)
pdf(paste(plotdir,"/","copy_numbercalls",".pdf",sep=""),7,4)
plot(copyNumbersCalled, ylim=c(-2,2), gaincol = "firebrick3", losscol = "dodgerblue3", delcol="dodgerblue4", ampcol = "firebrick4")
dev.off()

output <- list(readCounts = readCounts,
               readCountsFiltered = readCountsFiltered,
               copyNumbers = copyNumbers,
               copyNumbersNormalized = copyNumbersNormalized,
               copyNumbersSmooth = copyNumbersSmooth,
               copyNumbersSegmented = copyNumbersSegmented,
               copyNumbersCalled = copyNumbersCalled)
saveRDS(output, file = args$Rdata)

exportBins(copyNumbersSmooth, file=args$segmentfile)

pdf(paste(plotdir,"/","frequencyplot",".pdf",sep=""),7,4)
frequencyPlot(copyNumbersCalled, gaincol = "firebrick3", losscol = "dodgerblue3", delcol="dodgerblue4", ampcol = "firebrick4")
dev.off()
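The script is normally driven by the Snakemake rule shown earlier, but its argparse interface also allows a standalone run; the paths below are illustrative:

Rscript scripts/QDNAseq.R \
    --bamfiles sample1.bam sample2.bam \
    --binsize 500 \
    --plotdir plots \
    --Rdata qdnaseq.Rds \
    --segmentfile segments.txt \
    --pipelinedirectory /path/to/lpWGS \
    --filter ""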
scripts/report.R, which renders the HTML report:
library(tidyverse)
library(rmarkdown)
library(argparse)
library(cowplot)
parser <- ArgumentParser(description = "Parse arguments for QC metrics")
parser$add_argument('--output', type = 'character',
                    help="Output html file")
parser$add_argument('--QC', type = 'character',
                    help="QC file")
parser$add_argument('--CNA', type = 'character',
                    help="CNA file")
parser$add_argument('--plotdir', type = 'character',
                    help="Plotting directory")
parser$add_argument('--pipelinedirectory', type = 'character',
                    help="Pipeline directory", default = NULL)
parser$add_argument('--readscutoff', type = 'character',
                    help="Cut off to exclude reads from samples")
args <- parser$parse_args()
print(args)

dfQC <- read_csv(args$QC)
CNA <- readRDS(args$CNA)
print(dfQC)
print(typeof(args$readscutoff))

render(paste0(args$pipelinedirectory, "/scripts/report-basic.Rmd"),
    "html_document",
    output_dir = dirname(args$output),
    output_file = basename(args$output),
    intermediates_dir = dirname(args$output),
    clean = FALSE,
    params = list(dfQC = dfQC, CNA = CNA, plotdir = args$plotdir, readscutoff = args$readscutoff))
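As with the other scripts, report.R can be run standalone given the argparse interface above; the file names are illustrative:

Rscript scripts/report.R \
    --QC combined_qc.csv \
    --CNA qdnaseq.Rds \
    --output report.html \
    --plotdir reportplots \
    --pipelinedirectory /path/to/lpWGS \
    --readscutoff 100000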


URL: https://github.com/marcjwilliams1/lpWGS
Name: lpwgs
Version: 1
Maintainers: public
License: MIT License
