CNV-- Structural Variants pipeline

public public 1yr ago Version: 1.1.3 0 bookmarks

StructuralVariants Workflow

A CNV (Copy Number Variation) pipeline is a set of bioinformatics tools and algorithms used to detect and analyze genomic variations in the number of copies of DNA segments between individuals or populations.

Code Snippets

20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
baseCommand: [Rscript]

inputs:
  input:
    type: 'File[]'
    inputBinding:
      position: 2
      itemSeparator: ','
  samples:
    type: File
    inputBinding:
      position: 3
  script:
    type: File
    inputBinding:
      position: 1
    default:
        class: File
        basename: "batch_parser.R"
        contents: |-
          args <- commandArgs(TRUE)

          bamsList <- as.vector(strsplit(args[1], ",")[[1]])
          bamsList <- sapply(strsplit(bamsList, "[/]"), "[[", 3)
          samplesFile <- read.delim(args[2], header = T, sep = " ")

          bams <- data.frame(unlist(bamsList))
          colnames(bams)[1] <- "file"
          samples <- samplesFile

          indx <- sapply(samples$sample_id, grep, bams$file)
          temp_df <- cbind(samples[unlist(indx), , drop = F], bams[unlist(indx), , drop = F])
          temp_df$index_file <- paste(temp_df$file, ".bai", sep = "")
          output <- split(temp_df, temp_df$batch)

          sapply(names(output), function(x)
            write.table(output[[x]], file = paste0("bams_x_batch_", x, ".csv"), sep = ",", row.names = F))
19
20
21
22
23
24
25
baseCommand: [bedops, -u]

inputs:
  input:
    type: 'File[]'
    inputBinding:
      position: 1
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
baseCommand: []
arguments:
  - position: 0
    shellQuote: false
    valueFrom: >-
      $(inputs.input_amb && inputs.input_ann && inputs.input_bwt && inputs.input_pac && inputs.input_sa ? 'echo bwa' : inputs.generate_bwa_indexes ? 'bwa index' : 'echo bwa')

inputs:
  generate_bwa_indexes:
    type: boolean?
  input_fasta:
    type: File
    inputBinding:
      position: 2
  input_amb:
    type: File?
  input_ann:
    type: File?
  input_bwt:
    type: File?
  input_pac:
    type: File?
  input_sa:
    type: File?
  algoType:
    type: string?
    inputBinding:
      position: 1
      prefix: '-a'
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
baseCommand: [bwa, mem]

inputs:
  reads:
    type: 'File[]'
    inputBinding:
      position: 3
  reference_genome:
    type: File
    inputBinding:
      position: 2
    secondaryFiles:
      - .fai
      - .amb
      - .ann
      - .bwt
      - .pac
      - .sa
  threads:
    type: int?
    default: 1
    inputBinding:
      position: 1
      prefix: '-t'
  read_group:
    type: string?
    default: '@RG\tID:SRR709972\tSM:NA06985\tPL:ILLUMINA\tCN:CBRA\tLB:Fragment'
    inputBinding:
      position: 4
      prefix: '-R'
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
baseCommand: [bwa, mem]

inputs:
  reads:
    type: File
    inputBinding:
      position: 3
  reference_genome:
    type: File
    inputBinding:
      position: 2
    secondaryFiles:
      - .fai
      - .amb
      - .ann
      - .bwt
      - .pac
      - .sa
  threads:
    type: int?
    default: 1
    inputBinding:
      position: 1
      prefix: '-t'
  read_group:
    type: string?
    default: '@RG\tID:SRR709972\tSM:NA06985\tPL:ILLUMINA\tCN:CBRA\tLB:Fragment'
    inputBinding:
      position: 4
      prefix: '-R'
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
baseCommand: [Rscript]

inputs:
  input:
    type: 'File[]'
    inputBinding:
      position: 2
      itemSeparator: ','
  mapping:
    type: File
    inputBinding:
      position: 3
  bed:
    type: File
    inputBinding:
      position: 4
  script:
    type: File
    inputBinding:
      position: 1
    default:
        class: File
        basename: "CODEX2.R"
        contents: |-
          # load packages
          library(CODEX2)

          # parse arguments
          args <- commandArgs(TRUE)
          args2 <- args[2]

          bamsList <- as.vector(strsplit(args[1], ",")[[1]])
          bamsList <- sapply(strsplit(bamsList, "[/]"), "[[", 3)

          bams <- read.csv(args2, header = TRUE, sep = ",") # mapping file
          bedFile <- args[3]

          # mapping
          dirPath <- dirname(args2)
          bamsMap <- bams[bams$file %in% bamsList,]
          batch <- bamsMap$batch[[1]]
          batchname <- paste0("batch_", batch)

          # set global variables
          bamFile <- as.vector(bamsMap$file)
          bamdir <- file.path(dirPath, bamFile)
          sampname <- sapply(strsplit(bamFile, "[.]"), "[[", 1)

          bambedObj <- getbambed(bamdir = bamdir, bedFile = bedFile, sampname = sampname, projectname = batchname)

          bamdir <- bambedObj$bamdir
          sampname <- bambedObj$sampname
          ref <- bambedObj$ref
          projectname <- bambedObj$projectname

          nsamples <- length(bamFile)

          ##########################################################
          # Getting GC content and mappability
          ##########################################################
          gc <- getgc(ref)
          mapp <- getmapp(ref)

          ##########################################################
          # Getting gene names, needed for targeted sequencing, here generating gene names in silico
          ##########################################################
          gene <- rep(NA, length(ref))
          for (chr in as.matrix(unique(seqnames(ref)))) {
            chr.index <- which(seqnames(ref) == chr)
            gene[chr.index] <- paste0(chr, "_gene_", ceiling(chr.index / 30))
          }
          values(ref) <- cbind(values(ref), DataFrame(gc, mapp, gene))

          ##########################################################
          # Getting depth of coverage
          ##########################################################
          coverageObj <- getcoverage(bambedObj, mapqthres = 20)
          Y <- coverageObj$Y
          write.csv(Y, file = paste0(projectname, "_coverage.csv"), quote = FALSE)
          head(Y[, 1:nsamples])

          ##########################################################
          # Quality control
          ##########################################################
          qcObj <- qc(Y, sampname, ref, cov_thresh = c(20, Inf), length_thresh = c(20, Inf), mapp_thresh = 0.9, gc_thresh = c(20, 80))
          Y_qc <- qcObj$Y_qc
          sampname_qc <- qcObj$sampname_qc
          ref_qc <- qcObj$ref_qc
          qcmat <- qcObj$qcmat
          gc_qc <- ref_qc$gc
          write.table(qcmat, file = paste0(projectname, "_qcmat", ".txt"), sep = "\t", quote = FALSE, row.names = FALSE)

          ##########################################################
          # Estimating library size factor for each sample
          ##########################################################
          Y.nonzero <- Y_qc[apply(Y_qc, 1, function(x) { !any(x == 0) }), ]
          pseudo.sample <- apply(Y.nonzero, 1, function(x) { prod(x) ^ (1 / length(x)) })
          N <- apply(apply(Y.nonzero, 2, function(x) { x / pseudo.sample }), 2, median)
          plot(N, apply(Y, 2, sum), xlab = "Estimated library size factor", ylab = "Total sum of reads")

          ##########################################################
          # Genome-wide normalization using normalize_null
          ##########################################################
          # If there are negative control samples, use normalize_codex2_ns()
          # If there are negative control regions, use normalize_codex2_nr()
          normObj.null <- normalize_null(Y_qc = Y_qc, gc_qc = gc_qc, K = 1:nsamples, N = N)
          Yhat <- normObj.null$Yhat
          AIC <- normObj.null$AIC
          BIC <- normObj.null$BIC
          RSS <- normObj.null$RSS

          ##########################################################
          # Number of latent factors
          ##########################################################
          choiceofK(AIC, BIC, RSS, K = 1:nsamples , filename = "codex2_null_choiceofK.pdf")
          par(mfrow = c(1, 3))
          plot(1:nsamples, RSS, type = "b", xlab = "Number of latent variables", pch=20)
          plot(1:nsamples, AIC, type = "b", xlab = "Number of latent variables", pch=20)
          plot(1:nsamples, BIC, type = "b", xlab = "Number of latent variables", pch=20)
          par(mfrow = c(1,1))

          ##########################################################
          # CBS segmentation per chromosome: optimal for WGS and WES
          ##########################################################
          finalcall.CBS <- list()
          optK <- which.max(BIC)
          for (chr in unique(seqnames(ref_qc))) {
            chr.index <- which(seqnames(ref_qc) == chr)
            Yhat.null.chr <- lapply(Yhat, function(x) x[chr.index,])

            finalcall.CBS[[chr]] <- segmentCBS(Y_qc[chr.index,], Yhat = Yhat.null.chr, optK = optK, K = 1:nsamples, sampname_qc = sampname_qc, ref_qc = ranges(ref_qc)[chr.index], chr = chr, lmax = 400, mode = "integer")
          }
          # write results
          write.table(finalcall.CBS[[chr]], file = paste0(projectname, "_", optK, ".CODEX_Chrom.txt"), sep = "\t", quote = F, row.names = F)
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
baseCommand: [bash, -c]

inputs:
  script:
    type: string?
    inputBinding:
      position: 1
    default: |
      INPUT_FILE=$0
      SAMPLE_FILE=$1
      MIN_LEN=$2
      MAX_LEN=$3
      MIN_LRATIO=$4
      TEMP_NAME=${INPUT_FILE/_Chrom.txt/.filtered.txt}
      TEMP_FILE=$(echo "$TEMP_NAME" | sed "s/.*\///")
      OUTPUT_NAME=${TEMP_FILE/txt/bed}
      OUTPUT_FILE=$(echo "$OUTPUT_NAME" | sed "s/.*\///")

      tail -n +2 $INPUT_FILE | awk -v maxLen=$MAX_LEN -v minLen=$MIN_LEN -v minLRatio=$MIN_LRATIO '{ \
      chr=$2; \
      start=$4; \
      end=$5; \
      sample=$1; \
      len=$6; \
      lratio=$12; \
      st_exon=$7; \
      ed_exon=$8; \
      type=$3=="del"?"DEL":"DUP"; \
      tool="codex"; \
      if(len>minLen && \
         len<maxLen && \
         len/(ed_exon-st_exon)<50 && \
         lratio>=minLRatio){
           print chr"\t"start"\t"end"\t"sample"\t"type"\t"lratio"\t"tool}}' > ${TEMP_FILE}

      # mapping case_id with sample_id
      join -1 4 -2 2 -o 1.1,1.2,1.3,2.1,1.5,1.6,1.7 <(sort -k 4 $TEMP_FILE) <(sort -k 2 $SAMPLE_FILE) > ${OUTPUT_FILE}

  input:
    type: File
    inputBinding:
      position: 2
  samples:
    type: File
    inputBinding:
      position: 3
  min_len:
    type: string?
    inputBinding:
      position: 4
  max_len:
    type: string?
    inputBinding:
      position: 5
  min_lratio:
    type: string?
    inputBinding:
      position: 6
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
baseCommand: [bash, -c]

inputs:
  script:
    type: string?
    inputBinding:
      position: 1
    default: |
      INPUT_FILE=$0
      OUTPUT_NAME=${INPUT_FILE/bed/sorted.merged.bed}
      OUTPUT_FILE=$(echo "$OUTPUT_NAME" | sed "s/.*\///")

      for i in $(cat $INPUT_FILE | sed -e "s/[[:space:]]\+/\t/g" | cut -f4 | sort -u)  # FIXME: remove sed -e "s/[[:space:]]\+/\t/g" try better solution
      do
        for j in "DEL" "DUP"
        do
          grep $i $INPUT_FILE | grep $j | sed -e "s/[[:space:]]\+/\t/g" | bedtools sort | bedtools merge -c 6,7 -o max,distinct | awk -v sample=$i -v type=$j '{ \
            print $1,$2,$3,sample,type,$4,$5}' OFS="\t"
        done >> ${OUTPUT_FILE}
      done
  input:
    type: File
    inputBinding:
      position: 2
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
baseCommand: [Rscript]

inputs:
  input:
    type: 'File[]'
    inputBinding:
      position: 2
      itemSeparator: ','
  mapping:
    type: File
    inputBinding:
      position: 3
  reference_genome:
    type: File
    inputBinding:
      position: 4
  regions:
    type: File
    inputBinding:
      position: 5
  script:
    type: File
    inputBinding:
      position: 1
    default:
        class: File
        basename: "exomeDepth.R"
        contents: |-
          # load packages
          library(ExomeDepth)

          # parse arguments
          args <- commandArgs(TRUE)
          args2 <- args[2]

          bamsList <- as.vector(strsplit(args[1], ",")[[1]])
          bamsList <- sapply(strsplit(bamsList, "[/]"), "[[", 3)

          bams <- read.csv(args2, header = TRUE, sep = ",") # mapping file
          bed <- read.csv(args[4], sep = "\t")
          refgen <- args[3]

          # mapping
          dirPath <- dirname(args2)
          bamsMap <- bams[bams$file %in% bamsList,]
          batch <- bamsMap$batch[[1]]

          # set global variables
          bamFiles <- as.vector(bamsMap$file)
          baiFiles <- as.vector(bamsMap$index_file)
          bamdir <- file.path(dirPath, bamFiles)
          baidir <- file.path(dirPath, baiFiles)

          # prepare bed file (4th column is the annotation for each region)
          bed <- bed[,1:4]
          colnames(bed) <- c("chromosome", "start", "end", "name")

          # get counts per bed region
          message('\n[INFO] Creating counts matrix')
          my.counts <- getBamCounts(bed.frame = bed, bam.files = bamdir, include.chr = FALSE, min.mapq = 20, referenceFasta = refgen, index.files = baidir)
          ExomeCount.dafr <- as(my.counts, "data.frame")

          # prepare the main matrix of read count data
          ExomeCount.mat <- as.matrix(ExomeCount.dafr[, grep(names(ExomeCount.dafr), pattern = '*.bam')])
          nsamples <- ncol(ExomeCount.mat)

          results <- data.frame(NULL) # to write results

          # start looping over each sample
          for (i in 1:nsamples) {

            sampname <- sapply(strsplit(bamsList[[i]], "[.]"), "[[", 1) # get sample name
            message(paste0("\n[INFO] Creating reference set for ", bamsList[[i]]))

            # Create the aggregate reference set for this sample
            my.choice <- select.reference.set(test.counts = ExomeCount.mat[,i],
                                              reference.counts = ExomeCount.mat[,-i],
                                              bin.length = (ExomeCount.dafr$end - ExomeCount.dafr$start)/1000,
                                              n.bins.reduced = 10000)

            my.reference.selected <- apply(X = ExomeCount.mat[, my.choice$reference.choice, drop = FALSE],
                                          MAR = 1,
                                          FUN = sum)

            message('\n[INFO] Computing correlation between sample and references...\n')
            my.test = ExomeCount.mat[, i, drop = FALSE]
            my.ref.counts = ExomeCount.mat[, my.choice$reference.choice, drop = FALSE]
            correlation = cor(my.test[,1], apply(my.ref.counts, 1, mean))
            message(paste("\n[INFO] Correlation between reference and tests count is", round(correlation,4)))

            message('\n[INFO] Creating the ExomeDepth object')
            all.exons <- new('ExomeDepth',
                            test = ExomeCount.mat[,i],
                            reference = my.reference.selected,
                            formula = 'cbind(test, reference) ~ 1')

            # Call the CNVs
            message('\n[INFO] Calling CNVs')
            all.exons <- CallCNVs(x = all.exons,
                                  transition.probability = 10^-4,
                                  chromosome = ExomeCount.dafr$space,
                                  start = ExomeCount.dafr$start,
                                  end = ExomeCount.dafr$end,
                                  name = bed$name)

            [email protected]$sample_id <- sampname # set sample name
            results <- rbind(results, [email protected])  # append to results

          }
          message('\n[INFO] Writting results')
          output.file <- paste0("batch_", batch, ".exomeDepth.csv")
          write.csv(file = output.file, x = results, row.names = FALSE)
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
baseCommand: [bash, -c]

inputs:
  script:
    type: string?
    inputBinding:
      position: 1
    default: |
      INPUT_FILE=$0
      SAMPLE_FILE=$1
      MIN_LEN=$2
      MAX_LEN=$3
      MIN_BF=$4
      TEMP_NAME=${INPUT_FILE/csv/filtered.csv}
      TEMP_FILE=$(echo "$TEMP_NAME" | sed "s/.*\///")
      OUTPUT_NAME=${TEMP_FILE/csv/bed}
      OUTPUT_FILE=$(echo "$OUTPUT_NAME" | sed "s/.*\///")

      tail -n +2 $INPUT_FILE | sed 's/"//g' | awk -F"," -v maxLen=$MAX_LEN -v minLen=$MIN_LEN -v minBf=$MIN_BF '{ \
      chr=$7; \
      start=$5; \
      end=$6; \
      sample=$13; \
      bf=$9; \
      len=(end-start)/1000; \
      type=$3=="deletion"?"DEL":"DUP"; \
      tool="exomeDepth"; \
      if(len>minLen && \
         len<maxLen && \
         bf>=minBf){
           print chr"\t"start"\t"end"\t"sample"\t"type"\t"bf"\t"tool}}' > ${TEMP_FILE}

      # mapping case_id with sample_id
      join -1 4 -2 2 -o 1.1,1.2,1.3,2.1,1.5,1.6,1.7 <(sort -k 4 $TEMP_FILE) <(sort -k 2 $SAMPLE_FILE) > ${OUTPUT_FILE}

  input:
    type: File
    inputBinding:
      position: 2
  samples:
    type: File
    inputBinding:
      position: 3
  min_len:
    type: string?
    inputBinding:
      position: 4
  max_len:
    type: string?
    inputBinding:
      position: 5
  min_bf:
    type: string?
    inputBinding:
      position: 6
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
baseCommand: [fastp]

arguments:
  - prefix: '-o'
    valueFrom: $(inputs.fastq1.nameroot.replace(/\b.fastq\b/g, '')).trimmed.fastq
  - |
    ${
      if (inputs.fastq2){
        return '-O';
      } else {
        return '';
      }
    }
  - |
    ${
      if (inputs.fastq2){
        return inputs.fastq2.nameroot.replace(/\b.fastq\b/g, '') + '.trimmed.fastq';
      } else {
        return '';
      }
    }
  - prefix: '--unpaired1'
    valueFrom: $(inputs.fastq1.nameroot.replace(/\b.fastq\b/g, '')).unpaired.trimmed.fastq
  - |
    ${
      if (inputs.fastq2){
        return '--unpaired2';
      } else {
        return '';
      }
    }
  - |
    ${
      if (inputs.fastq2){
        return inputs.fastq2.nameroot.replace(/\b.fastq\b/g, '') + ".unpaired.trimmed.fastq";
      } else {
        return '';
      }
    }

inputs:
  fastq1:
    type: File
    inputBinding:
      prefix: -i
  fastq2:
    type: File
    inputBinding:
      prefix: -I
  threads:
    type: int?
    default: 1
    inputBinding:
      position: 1
      prefix: '--thread'
  cut_right:
    type: boolean?
    default: true
    inputBinding:
      prefix: '--cut_right'
  cut_right_window_size:
    type: int?
    default: 5
    inputBinding:
      prefix: '--cut_right_window_size'
  cut_right_mean_quality:
    type: int?
    default: 24
    inputBinding:
      prefix: '--cut_right_mean_quality'
  trim_tail1:
    type: int?
    default: 1
    inputBinding:
      prefix: '--trim_tail1'
  length_required:
    type: int?
    default: 70
    inputBinding:
      prefix: '--length_required'
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
baseCommand: [fastqc]

arguments:
  - valueFrom: $(runtime.outdir)
    prefix: '-o'
  - valueFrom: '--noextract'

inputs:
  fastq1:
    type: 'File[]'
    inputBinding:
      position: 2
  fastq2:
    type: 'File[]'
    inputBinding:
      position: 3
  threads:
    type: int?
    default: 1
    inputBinding:
      position: 1
      prefix: '--threads'
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
baseCommand: [gridss]

arguments:
  - position: 2
    prefix: '--output'
    valueFrom: $(inputs.input.nameroot.split('.')[0]).gridss.raw.vcf.gz
  - position: 5
    prefix: '--jar'
    valueFrom: "/usr/local/share/gridss-2.9.3-0/gridss.jar"

inputs:
  input:
    type: File
    secondaryFiles:
      - .bai
    inputBinding:
      position: 7
  reference_genome:
    type: File
    inputBinding:
      prefix: '--reference'
      position: 1
  assemblyFilename:
    type:
      - string?
      - "null"
    default: "gridss.assembly.bam"
    inputBinding:
      prefix: '--assembly'
      position: 3
  threads:
    type: int?
    default: 1
    inputBinding:
      prefix: '--threads'
      position: 4
  blacklist:
    type: File?
    inputBinding:
      prefix: '--blacklist'
      position: 6
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
baseCommand: [bash, -c]

inputs:
  script:
    type: string?
    inputBinding:
      position: 1
    default: |
      INPUT_FILE=$0
      SAMPLE_FILE=$1
      MIN_LEN=$2
      MAX_LEN=$3
      MIN_Q=$4
      OUTPUT_NAME=${INPUT_FILE/raw/filtered}
      OUTPUT_FILE=$(echo "$OUTPUT_NAME" | sed "s/.*\///")

      # mapping case_id with sample_id
      SAMPLE_ID=(${OUTPUT_FILE//./ })
      CASE_ID=$(awk -v search="$SAMPLE_ID" '$0 ~ search{print $1}' "$SAMPLE_FILE")

      tail -n +2 $INPUT_FILE | awk -v maxLen=$MAX_LEN -v minLen=$MIN_LEN -v minQ=$MIN_Q '{ \
      chr=$1; \
      start=$2; \
      end=$6; \
      sample="'$CASE_ID'"; \
      q=$11; \
      len=(end-start)/1000; \
      type=$7; \
      tool="gridss"; \
      if(len>minLen && \
         len<maxLen && \
         q>=minQ && \
         (type=="DEL" || type=="DUP")){
           print chr"\t"start"\t"end"\t"sample"\t"type"\t"q"\t"tool}}' > ${OUTPUT_FILE}

  input:
    type: File
    inputBinding:
      position: 2
  samples:
    type: File
    inputBinding:
      position: 3
  min_len:
    type: string?
    inputBinding:
      position: 4
  max_len:
    type: string?
    inputBinding:
      position: 5
  min_q:
    type: string?
    inputBinding:
      position: 6
17
18
19
20
21
22
23
24
25
26
27
28
29
baseCommand: []
arguments:
  - position: 0
    shellQuote: false
    valueFrom: >-
      $(inputs.input_fasta.nameext == '.gz' ? 'gunzip -c' : 'echo gunzip')


inputs:
  input_fasta:
    type: File
    inputBinding:
      position: 2
CWL From line 17 of tools/gunzip.cwl
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
arguments:
  - position: 0
    valueFrom: "/usr/local/share/manta-1.6.0-0/bin/configManta.py"
    shellQuote: false
  - position: 2
    valueFrom: $(";{runDir}/runWorkflow.py".replace(/\{runDir\}/g, inputs.runDir))
    shellQuote: false

inputs:
  input:
    type: File
    secondaryFiles:
      - .bai
    inputBinding:
      prefix: '--bam'
      position: 1
      shellQuote: false
  reference_genome:
    type: File
    secondaryFiles:
      - .fai
      - .amb
      - .ann
      - .bwt
      - .pac
      - .sa
    inputBinding:
      prefix: '--referenceFasta'
      position: 1
      shellQuote: false
  exome:
    type: boolean?
    inputBinding:
      prefix: "--exome"
      position: 1
      shellQuote: false
  regions:
    type: File
    secondaryFiles:
      - .tbi
    inputBinding:
      prefix: '--callRegions'
      position: 1
      shellQuote: false
  runDir:
    type: string?
    default: "generated"
    inputBinding:
      prefix: '--runDir'
      position: 1
      shellQuote: false
CWL From line 22 of tools/manta.cwl
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
baseCommand: [bash, -c]

inputs:
  script:
    type: string?
    inputBinding:
      position: 1
    default: |
      INPUT_FILE=$0
      SAMPLE_FILE=$1
      MIN_LEN=$2
      MAX_LEN=$3
      MIN_Q=$4
      OUTPUT_NAME=${INPUT_FILE/raw/filtered}
      OUTPUT_FILE=$(echo "$OUTPUT_NAME" | sed "s/.*\///")

      # mapping case_id with sample_id
      SAMPLE_ID=(${OUTPUT_FILE//./ })
      CASE_ID=$(awk -v search="$SAMPLE_ID" '$0 ~ search{print $1}' "$SAMPLE_FILE")

      tail -n +2 $INPUT_FILE | awk -v maxLen=$MAX_LEN -v minLen=$MIN_LEN -v minQ=$MIN_Q '{ \
      chr=$1; \
      start=$2; \
      end=$6; \
      sample="'$CASE_ID'"; \
      q=$8; \
      len=(end-start)/1000; \
      type=$11; \
      tool="manta"; \
      if(len>minLen && \
        len<maxLen && \
        q>=minQ && \
        (type=="DEL" || type=="DUP")){
          print chr"\t"start"\t"end"\t"sample"\t"type"\t"q"\t"tool}}' > ${OUTPUT_FILE}

  input:
    type: File
    inputBinding:
      position: 2
  samples:
    type: File
    inputBinding:
      position: 3
  min_len:
    type: string?
    inputBinding:
      position: 4
  max_len:
    type: string?
    inputBinding:
      position: 5
  min_q:
    type: string?
    inputBinding:
      position: 6
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
baseCommand: [bash, -c]

inputs:
  script:
    type: string?
    inputBinding:
      position: 1
    default: |
      INPUT_FILE=$0
      OUTPUT_NAME=${INPUT_FILE/bed/sorted.merged.bed}
      OUTPUT_FILE=$(echo "$OUTPUT_NAME" | sed "s/.*\///")

      for i in $(cat $INPUT_FILE | sed -e "s/[[:space:]]\+/\t/g" | cut -f4 | sort -u)  # FIXME: remove sed -e "s/[[:space:]]\+/\t/g" try better solution
      do
        for j in "DEL" "DUP"
        do
          grep $i $INPUT_FILE | grep $j | sed -e "s/[[:space:]]\+/\t/g" | bedtools sort | bedtools merge -c 6,7 -o collapse | awk -v sample=$i -v type=$j '{ \
            print $1,$2,$3,sample,type,$4,$5}' OFS="\t"
        done >> ${OUTPUT_FILE}
      done
  input:
    type: File
    inputBinding:
      position: 2
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
baseCommand: [picard, MarkDuplicates]

arguments:
  - position: 2
    valueFrom: OUTPUT=$(inputs.input.nameroot).dedup.bam
  - position: 3
    valueFrom: METRICS_FILE=$(inputs.input.nameroot).dedup.metrics.txt
  - position: 4
    valueFrom: ASSUME_SORTED=TRUE

inputs:
  input:
    type: File
    inputBinding:
      position: 1
      prefix: 'INPUT='
      separate: false
18
19
20
21
22
23
24
25
26
27
28
29
30
31
baseCommand: []
arguments:
  - position: 0
    shellQuote: false
    valueFrom: >-
      $(inputs.input_index ? 'echo samtools faidx' : 'samtools faidx')

inputs:
  input_fasta:
    type: File
    inputBinding:
      position: 1
  input_index:
    type: File?
19
20
21
22
23
24
25
26
27
28
29
baseCommand: [samtools, index]

arguments:
  - position: 1
    valueFrom: -b

inputs:
  input:
    type: File
    inputBinding:
      position: 2
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
baseCommand: [samtools, merge, -f]

arguments:
  - position: 2
    valueFrom: $(inputs.paired.nameroot.split('.')[0]).sorted.bam

inputs:
  paired:
    type: File
    inputBinding:
      position: 3
  unpaired_R1:
    type: File
    inputBinding:
      position: 4
  unpaired_R2:
    type: File
    inputBinding:
      position: 5
  threads:
    type: int?
    default: 1
    inputBinding:
      position: 1
      prefix: '--threads'
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
baseCommand: [samtools, sort]

arguments:
  - position: 3
    prefix: '-o'
    valueFrom: $(inputs.input.nameroot).sorted.bam

inputs:
  input:
    type: File
    inputBinding:
      position: 2
  threads:
    type: int?
    default: 1
    inputBinding:
      position: 1
      prefix: '--threads'
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
baseCommand: [samtools, view]

arguments:
  - position: 5
    prefix: '-o'
    valueFrom: $(inputs.input.nameroot).filtered.bam

inputs:
  input:
    type: File
    inputBinding:
      position: 4
  min_mapping_quality:
    type: int
    inputBinding:
      position: 2
      prefix: '-q'
  bits_set:
    type: int
    inputBinding:
      position: 3
      prefix: '-F'
  threads:
    type: int?
    default: 1
    inputBinding:
      position: 1
      prefix: '--threads'
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
baseCommand: [samtools, view]

arguments:
  - position: 3
    prefix: '-o'
    valueFrom: $(inputs.input.nameroot).bam

inputs:
  input:
    type: File
    inputBinding:
      position: 2
      prefix: '-bS'
  threads:
    type: int?
    default: 1
    inputBinding:
      position: 1
      prefix: '--threads'
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
baseCommand: [Rscript]

inputs:
  input:
    type: File
    inputBinding:
      position: 2
  script:
    type: File
    inputBinding:
      position: 1
    default:
        class: File
        basename: "gridss.R"
        contents: |-
          library(stringr)
          library("VariantAnnotation")
          library("StructuralVariantAnnotation")

          args <- commandArgs(TRUE)
          vcfFile <- args[1]
          bedFile <- paste(sub('\\.vcf.gz$', '', vcfFile), ".bed", sep = "")

          simpleEventType <- function(gr) {
            return(
              ifelse(seqnames(gr) != seqnames(partner(gr)), "Translocation",
                     ifelse(gr$insLen >= abs(gr$svLen) * 0.7, "Insertion",
                            ifelse(strand(gr) == strand(partner(gr)), "Inversion",
                                   ifelse(xor(start(gr) < start(partner(gr)), strand(gr) == "-"), "DEL",
                                          "DUP")))))
          }
          options(scipen=999)
          full_vcf <- readVcf(vcfFile)
          gr <- breakpointRanges(full_vcf)
          bedpe <- data.frame(
            chrom1=seqnames(gr),
            start1=start(gr) - 1,
            end1=end(gr),
            chrom2=seqnames(partner(gr)),
            start2=start(partner(gr)) - 1,
            end2=end(partner(gr)),
            type=simpleEventType(gr),
            svLen=gr$svLen,
            insLen=gr$insLen,
            name=names(gr),
            score=gr$QUAL,
            strand1=strand(gr),
            strand2=strand(partner(gr))
          )

          bedpe <- bedpe[str_detect(bedpe$name, "gridss.+o"),]
          write.table(bedpe, bedFile, quote=FALSE, sep='\t', row.names=FALSE, col.names=FALSE)
20
21
22
23
24
25
26
27
28
29
30
31
32
baseCommand: [svtools, vcftobedpe]

arguments:
  - position: 2
    prefix: '-o'
    valueFrom: $(inputs.input.nameroot.replace('vcf', 'bed'))

inputs:
  input:
    type: File
    inputBinding:
      position: 1
      prefix: '-i'
CWL From line 20 of tools/svtools.cwl
ShowHide 17 more snippets with no or duplicated tags.

Login to post a comment if you would like to share your experience with this workflow.

Do you know this workflow well? If so, you can request seller status , and start supporting this workflow.

Free

Created: 1yr ago
Updated: 1yr ago
Maitainers: public
URL: https://gitlab.bsc.es/lrodrig1/structuralvariants_poc/-/tree/1.1.3/structuralvariants/cwl
Name: cnv_pipeline
Version: 1.1.3
Badge:
workflow icon

Insert copied code into your website to add a link to this workflow.

Downloaded: 0
Copyright: Public Domain
License: Boost Software License 1.0
  • Future updates

Related Workflows

cellranger-snakemake-gke
snakemake workflow to run cellranger on a given bucket using gke.
A Snakemake workflow for running cellranger on a given bucket using Google Kubernetes Engine. The usage of this workflow ...