Reference alignment workflow

Public repository — Version v0.1

An example run script:

configfile=config/config.yaml
threads=200
snakemake --configfile $configfile --cores $threads --use-conda -p

Or if you want to distribute over a cluster:

mkdir -p logs/drmaa
configfile=config/config.yaml
threads=200
snakemake --configfile $configfile --jobs $threads --use-conda -p \
    --drmaa " -l centos=7 -l h_rt=48:00:00 -l mfree=8G -pe serial {threads} -V -cwd -S /bin/bash -w n" \
    --drmaa-log-dir logs/drmaa

Or if you want to make ideograms:

configfile=config/config.yaml
threads=200
snakemake --configfile $configfile --cores $threads --use-conda -p ideogram

Notes on use of the pipeline in Vollger et al., 2023

Running the alignment and gene conversion identification pipeline:

snakemake \
    --configfile config/config_asm20.yaml \
    --cores $threads \
    --use-conda \
    -p \
    gene_conversion

Information on where to download the input assemblies can be found on Zenodo.

Config files for human assemblies:

config/config_asm20.yaml
config/table.asm.tbl

Config files for the Clint PTR assembly:

config/clint.yaml
config/clint.asm.tbl

Code Snippets

16
17
18
19
20
21
22
shell:
    """
    awk '$4-$3>{params.min_aln_len}' {input.paf} \
        | rb paf-to-sam \
        | samtools sort -@ {threads} -m {resources.mem}G \
    > {output.bam}
    """
38
39
40
41
42
43
44
shell:
    """
    awk '$4-$3>{params.min_aln_len}' {input.paf} \
        | rb paf-to-sam -f {input.query} \
        | samtools sort -@ {threads} -m {resources.mem}G \
    > {output.bam}
    """
58
59
60
61
shell:
    """
    samtools sort -@ {threads} -m {resources.mem}G {input.aln} > {output.bam}
    """
77
78
79
80
shell:
    """
    htsbox pileup -q0 -evcf {input.ref} {input.bam}  > {output.vcf}
    """
 96
 97
 98
 99
100
101
102
103
104
shell:
    """
    ( dipcall-aux.js vcfpair -s {wildcards.sm} -a {input.vcf} \
        | bcftools norm -Ov -m-any \
        | bcftools norm -Ov -d exact \
        | bcftools norm -Ov -m-any --fasta-ref {input.ref} --check-ref w \
        | bcftools sort -m {resources.mem}G \
        | htsbox bgzip > {output.vcf} ) 2> {log}
    """
114
115
116
117
shell:
    """
    bcftools index {input.vcf}
    """
139
140
141
142
143
144
145
146
147
148
149
150
151
152
shell:
    """
    if [ {params.n_samples} == 1 ]; then
        bcftools norm --threads {threads} -Ov -m-any {input.vcf} \
            | bgzip -@ {threads} \
            > {output.vcf}
    else 
        bcftools merge \
            --threads {threads} {input.vcf} \
            | bcftools norm --threads {threads} -Ov -m-any \
            | bgzip -@ {threads} \
            > {output.vcf}
    fi
    """
165
166
167
168
169
170
shell:
    """
    awk '$4-$3>{params.min_aln_len}' {input.paf} \
        | csvtk cut  -tT -f 6,8,9,1,3,4 \
        | bgzip > {output.bed}
    """
189
190
191
192
193
194
195
196
197
198
199
200
shell:
    """
    ls {input.bed} \
        | parallel -n 1 \
        $'zcat {} | cut -f 1-3 | sed \'s/$/\\t{{/.}}/g\' '
    > {output.bed}
    head {output.bed}

    python {params.add_gt} -v {input.vcf} {output.bed} \
        | bgzip -@ {threads} \
        > {output.vcf}
    """
214
215
216
217
218
219
220
221
222
shell:
    """
    ( echo '{params.header}'; \
        bcftools query \
                -f '%CHROM\t%POS0\t%END\t%CHROM-%POS-%TYPE-%REF-%ALT\t%TYPE\t%REF\t%ALT\t{wildcards.sm}\th1;h2\t[ %GT]\n' \
                {input.vcf} \
    ) \
        | bgzip > {output.bed}
    """
25
26
27
28
29
30
31
32
shell:
    """
    bedtools sort -i {input.bed} \
        | bedtools merge -i - \
        | bedtools slop -i - -g {input.genome} -b {params.slop} \
        | bedtools merge -i - \
        > {output.bed}
    """
52
53
54
55
56
57
58
59
60
shell:
    """
    awk '$4-$3>{params.min_aln_len}' {input.paf} \
        | rb --threads {threads} liftover --bed <( grep -v "^#" {input.bed}) \
        | csvtk cut  -tT -f 1,3,4 \
        | bedtools makewindows -s {params.slide} -w {params.window} -b - \
        | sed 's/$/\t{wildcards.sm}/g' \
    > {output.bed}
    """
78
79
80
81
82
83
84
85
86
87
88
89
shell:
    """
    grep -w {wildcards.sm} {input.bed} \
        | cut -f 1-3 \
        | bedtools sort -i -  > {output.tbed}

    ( rb -vv --threads {threads} liftover \
        -q --bed <( grep -v "^#" {output.tbed} ) \
        --largest {input.paf} \
        | grep -v "cg:Z:{params.window}=" \
        > {output.paf} ) 2> {log}
    """
106
107
108
109
110
shell:
    """
    seqtk seq -A -l 60 {input.query} > {output.fa}
    samtools faidx {output.fa}
    """
131
132
133
134
135
136
137
138
139
140
141
142
143
shell:
    """
    minimap2 -K 1G -t {threads} \
        -cx asm20 \
        -k 15 \
        --secondary=no --eqx \
        {input.ref} \
            <( bedtools getfasta -name+ \
                -fi {input.query} \
                -bed <(awk -v OFS=$'\t' '{{name=$1":"$3"-"$4}}{{print $6,$8,$9,name}}' {input.paf}) \
            ) \
        > {output.aln}
    """
156
157
158
159
160
161
162
shell:
    """
    rb --threads {threads} stats --paf {input.paf} \
        > {output.tbl}
    rb --threads {threads} stats --paf {input.liftover_paf} \
        > {output.liftover_tbl}
    """
175
176
script:
    "../scripts/combine-mappings.R"
189
190
191
192
193
194
195
196
shell:
    """
    python {params.find_pairs} \
        --fraction 0.01 --overlap 1 --source-windows \
        --distance {params.window} \
        --input {input.bed} \
    > {output.bed}
    """
213
214
215
216
217
218
219
220
shell:
    """
    rb liftover -q \
        --bed <(cut -f 1-3 {input.bed} | grep -v "^#" ) \
        --largest {input.paf} \
            | grep -v "cg:Z:{params.window}=" \
    > {output.paf} 2> {log}
    """
237
238
239
240
241
242
243
244
245
246
247
248
249
shell:
    """
    minimap2 -K 1G -t {threads} \
        -cx asm20 \
        -k 15 \
        --secondary=no --eqx \
        {input.ref} \
            <( bedtools getfasta -name+ \
                -fi {input.query} \
                -bed <(awk -v OFS=$'\t' '{{name=$1":"$3"-"$4}}{{print $6,$8,$9,name}}' {input.paf}) \
            ) \
        > {output.aln}
    """
262
263
264
265
266
267
268
shell:
    """
    rb --threads {threads} stats --paf {input.paf} \
        > {output.tbl}
    rb --threads {threads} stats --paf {input.liftover_paf} \
        > {output.liftover_tbl}
    """
281
282
script:
    "../scripts/combine-mappings.R"
294
295
296
297
298
299
300
shell:
    """
    python {params.find_pairs} \
        --fraction 0.5 --overlap 1 \
        --input {input.bed} \
    > {output.bed}
    """
316
317
script:
    "../scripts/gene-conversion-windows.R"
332
333
334
335
336
337
shell:
    """
    (head -n 1 {input.beds[0]}; tail -q -n +2 {input.beds}) \
        | pigz -p {threads} \
        > {output.bed}
    """
353
354
script:
    "../scripts/gene-conversion-windows.R"
369
370
371
372
373
374
shell:
    """
    bgzip -@ {threads} {input.acceptor} -c > {output.acceptor}
    bgzip -@ {threads} {input.bed} -c > {output.bed}
    bgzip -@ {threads} {input.interact} -c > {output.interact}
    """
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
shell:
    """
    # make interactions
    bedtools sort -i {input.interact} \
        > {output.bed}
    bedToBigBed -as={params.interact} \
        -type=bed5+13 {output.bed} {input.fai} {output.interact}

    # all windows
    bedToBigBed -as={params.fmt} -type=bed9+10 \
        {input.bed} {input.fai} {output.bb} 

    # bed graphs
    bedtools genomecov -i <(grep -w Donor {input.bed}) \
        -g {input.fai} -bg > {output.bg}
    bedGraphToBigWig {output.bg} {input.fai} {output.bwd}

    bedtools genomecov -i <(grep -w Acceptor {input.bed}) \
        -g {input.fai} -bg > {output.bg}
    bedGraphToBigWig {output.bg} {input.fai} {output.bwa}
    """
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
shell:
    """
     # make interactions
     bedtools sort -i {input.interact} \
         > {output.bed}
     bedToBigBed -as={params.interact} \
         -type=bed5+13 {output.bed} {input.fai} {output.interact}

    # make others
     bedToBigBed -as={params.fmt} -type=bed9+ \
         {input.bed} {input.fai} {output.bb} 


    csvtk cut  -tT -C "$" \
        -f query_name,query_start,query_end,mismatches \
        {input.liftover_tbl} \
        | tail -n+2 \
        | awk -v OFS=$'\t' '{{print $1,$2,$2+1,$4}}' \
        | bedtools sort -i - \
        | bedtools merge -i - -c 4 -o mean \
        > {output.bed}
    bedGraphToBigWig {output.bed} {input.fai} {output.bg}
    """
482
483
script:
    "../scripts/track_hub.py"
498
499
500
501
502
503
504
505
506
507
508
509
run:
    with open(output.tbl, "w+") as out:
        out.write("sample\thap\tfile\n")
        for sm, f in zip(params.samples, input):
            sm, hap = sm.split("_")
            out.write(
                "{}\t{}\t{}\n".format(
                    sm,
                    hap,
                    os.path.abspath(f),
                )
            )
528
529
530
531
532
533
534
535
536
537
538
shell:
    """
    head -n1 {input.bed[0]} > {output.tmp}
    cat {input.bed} | grep -v "^#" >> {output.tmp}

    python {params.find_pairs} \
        --fraction 0.8 --reciprocal \
        --input {output.tmp} \
        | bgzip -@ {threads} \
    > {output.bed}
    """
40
41
shell:
    "minimap2 -t {threads} -ax asm20 -d {output.mmi} {input.ref}"
60
61
62
63
64
65
66
67
shell:
    """
    minimap2 -t {threads} -a --eqx --cs \
        {params.mm2_opts} \
        {input.ref} {input.query} \
        | samtools view -F 4 -b - \
        > {output.aln} 2> {log}
    """
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
shell:
    """
    if [ {params.second_aln} == "yes" ]; then
      minimap2 -t {threads} -a --eqx --cs \
          {params.mm2_opts} \
          <(seqtk seq \
              -M <(samtools view -h {input.aln} | paftools.js sam2paf - | cut -f 6,8,9 | bedtools sort -i -) \
              -n "N" {input.ref_fasta} \
          ) \
          <(seqtk seq \
              -M <(samtools view -h {input.aln} | paftools.js sam2paf - | cut -f 1,3,4 | bedtools sort -i -) \
              -n "N" {input.query} \
          ) \
          | samtools view -F 4 -b - \
          > {output.aln} 2> {log}
    else
      samtools view -b -H {input.aln} > {output.aln}
    fi
    """
117
118
119
120
121
shell:
    """
    samtools cat {input.aln} {input.aln2} \
             -o {output.aln}
    """
133
134
135
136
137
138
shell:
    """
    samtools view -h {input.aln} \
        | paftools.js sam2paf - \
    > {output.paf}
    """
150
151
152
153
154
155
shell:
    """
    rustybam trim-paf {input.paf} \
        | rustybam break-paf --max-size {params.break_paf} \
    > {output.paf}
    """
167
168
169
170
shell:
    """
    rb --threads {threads} stats {input.aln} > {output.bed}
    """
184
185
186
187
188
189
190
shell:
    """
    Rscript {params.smkdir}/scripts/ideogram.R \
      --asm {input.bed} \
      --asm2 {input.bed2} \
      --plot {output.pdf}
    """
203
204
205
206
207
208
209
shell:
    """
    {params.smkdir}/scripts/ends_from_paf.py \
      --minwidth 10 \
      --width 1 \
      {input.paf} > {output.bed}
    """
221
222
223
224
225
226
227
228
shell:
    """
    rb liftover --largest --qbed \
        --bed <( grep -v "^#" {input.bed} ) \
        {input.paf} \
      | rb stats --paf --qbed \
      > {output.bed}
    """
239
240
241
242
243
244
245
246
247
shell:
    """
    head -n 1 {input.beds[0]} > {output.bed}
    cat {input.beds} \
      | grep -v "^#" \
      | awk '$2 > $3 {{ temp = $3; $3 = $2; $2 = temp }} 1' OFS='\t' \
      | bedtools sort -i - \
      >> {output.bed}
    """
259
260
261
262
263
264
265
266
267
shell:
    """
    bedtools intersect -wa -wb -header \
      -a <(printf "#chr\tstart\tend\n" ; bedtools makewindows -w 1000000 -g {input.fai} ) \
      -b {input.bed} \
      > {output.bed}
    header=$(head -n1 {input.bed})
    sed -i " 1 s/$/\t$header/" {output.bed}
    """
279
280
281
282
283
284
285
286
shell:
    """
    bedtools nuc \
      -fi {input.ref} \
      -bed <(bedtools makewindows -s 100 -w 1000 -g {input.fai} ) \
      | pigz \
      > {output.allbed}
    """
299
300
301
302
303
304
305
shell:
    """
    bedtools intersect -header -u -a {input.allbed} \
      -b <(bedtools slop -b 10000 -g {input.fai} -i {input.bed}) \
      | pigz \
      > {output.bed}
    """
15
16
17
18
19
20
21
22
shell:
    """
    rb break-paf --max-size {params.break_saffire} {input.paf} \
        | rb orient \
        | rb filter --paired-len {params.paired_aln_len} \
        | rb stats --paf  \
    > {output.bed}
    """
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
source("workflow/scripts/plotutils.R")
sample <- "HG002_1"
windowf <- "results/CHM13_V1.1/gene-conversion/HG002_1_windows.tbl.gz"
liftoverf <- "results/CHM13_V1.1/gene-conversion/HG002_1_liftover_windows.tbl.gz"
liftoverf <- "/Users/mrvollger/Desktop/EichlerVolumes/assembly_breaks/nobackups/asm-to-reference-alignment/results/CHM13_V1.1/gene-conversion/HG002_1_liftover_windows.tbl.gz"
windowf <- "/Users/mrvollger/Desktop/EichlerVolumes/assembly_breaks/nobackups/asm-to-reference-alignment/results/CHM13_V1.1/gene-conversion/HG002_1_windows.tbl.gz"
windowf <- snakemake@input$window
liftoverf <- snakemake@input$liftover
sample <- snakemake@wildcards$sm
window <- 10000
window <- snakemake@params$window
print(windowf)
print(liftoverf)
print(sample)
options(scipen = 999)


window.df <- fread(windowf) %>%
    separate(query_name,
        into = c("original_mapping", "original_source"), sep = "::"
    ) %>%
    separate(original_source,
        into = c("contig", "start", "end"),
        sep = ":|-",
        remove = FALSE
    ) %>%
    mutate(
        contig_start = as.numeric(start) + query_start,
        contig_end = as.numeric(start) + query_end
    ) %>%
    dplyr::select(-query_start, -query_end, -query_length, -start, -end) %>%
    data.table()

liftover.df <- fread(liftoverf) %>%
    mutate(
        original_mapping = paste(
            query_name, ":",
            query_start, "-", query_end,
            sep = ""
        ),
        original_source = paste(
            `#reference_name`, ":",
            reference_start, "-", reference_end,
            sep = ""
        ),
    ) %>%
    dplyr::rename(
        contig = `#reference_name`,
        contig_start = `reference_start`,
        contig_end = `reference_end`,
        reference_name.liftover = query_name,
        reference_start = query_start,
        reference_end = query_end
    ) %>%
    dplyr::select(-query_length) %>%
    data.table()

overlap_bp <- function(df) {
    # x1 <= y2 && y1 <= x2
    intersects <- df$`#reference_name` == df$reference_name.liftover &
        df$reference_start <= df$reference_end.liftover &
        df$reference_start.liftover <= df$reference_end
    overlap <- df$reference_end.liftover - df$reference_start
    intersects * overlap
}

df <- merge(
    window.df,
    liftover.df,
    by = c("original_mapping", "original_source"),
    suffixes = c("", ".liftover")
) %>%
    mutate(overlap = overlap_bp(.)) %>%
    filter(
        overlap == 0 &
            # matches + mismatches >= 0.9 * window &
            # matches.liftover + mismatches.liftover >= 0.9 * window &
            matches + mismatches >= 0.9 * (matches.liftover + mismatches.liftover) &
            matches - matches.liftover >= 0
    ) %>%
    relocate(original_mapping, .after = last_col()) %>%
    relocate(original_source, .after = last_col()) %>%
    arrange(`#reference_name`, `reference_start`, `reference_end`) %>%
    data.table()
df$sample <- sample

write.table(df, file = snakemake@output$bed, sep = "\t", row.names = F, quote = F)
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import os
import sys
import argparse
import pandas as pd

"""
Col	Type	Description
1	string	Query sequence name
2	int	Query sequence length
3	int	Query start (0-based; BED-like; closed)
4	int	Query end (0-based; BED-like; open)
5	char	Relative strand: "+" or "-"
6	string	Target sequence name
7	int	Target sequence length
8	int	Target start on original strand (0-based)
9	int	Target end on original strand (0-based)
10	int	Number of residue matches
11	int	Alignment block length
12	int	Mapping quality (0-255; 255 for missing)
"""
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="", formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("infile", help="positional input")
    parser.add_argument(
        "-d", help="store args.d as true if -d", action="store_true", default=False
    )
    parser.add_argument(
        "--minwidth", help="minimum alignment size to keep", type=int, default=5e4
    )
    parser.add_argument("-w", "--width", help="end size", type=int, default=1000)
    args = parser.parse_args()
    df = pd.read_csv(
        args.infile, sep="\t", header=0, usecols=range(12), names=list(range(12))
    )

    # filter alignments that are too small to consider
    # unless they make up the entire contig more or less
    # remove = (df[10] < args.minwidth) & (df[1] > 2*args.minwidth)
    # df = df[~remove]

    outfmt = "{}\t{}\t{}\t{}\t{}\t{}"
    for name, g in df.groupby(0):
        length = g[1].min()
        mmin = g[2].min()
        mmax = g[3].max()
        if (mmax - mmin) < 2 * args.width:
            print(outfmt.format(name, mmin, mmax, name + "_full", length, "full"))
        else:
            print(
                outfmt.format(
                    name, mmin, mmin + args.width, name + "_start", length, "start"
                )
            )
            print(
                outfmt.format(
                    name, mmax - args.width, mmax, name + "_end", length, "end"
                )
            )
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
source("workflow/scripts/plotutils.R")
f <- "/Users/mrvollger/Desktop/EichlerVolumes/assembly_breaks/nobackups/asm-to-reference-alignment/results/CHM13_V1.1/gene-conversion/all_candidate_windows.tbl.gz"
f <- "results/CHM13_V1.1/gene-conversion/all_candidate_windows.tbl.gz"
window <- 1e4
window <- snakemake@params$window
simplify <- snakemake@params$simplify
merge <- snakemake@params$merge

f <- snakemake@input$bed
print(f)
options(scipen = 999)

ttdf <- fread(f, nThread = 8, sep = "\t") %>%
    mutate(reference_name = `#reference_name`) %>%
    group_by(group, contig, reference_name, reference_name.liftover, sample)

if (merge) {
    tdf <- ttdf %>%
        ungroup() %>%
        group_by(group, reference_name, reference_name.liftover)
}

if (simplify) {
    print("SIMPLIFY")
    tdf <- ttdf %>%
        mutate(mmscore = matches - matches.liftover) %>%
        group_by(
            group, contig, reference_name, reference_name.liftover, sample
        ) %>%
        slice_max(mmscore, n = 1, with_ties = FALSE) %>%
        dplyr::select(-mmscore)
    print(paste(nrow(ttdf), nrow(tdf)))
}

df <- tdf %>%
    summarise(
        `#reference_name` = unique(`#reference_name`),
        reference_name = unique(reference_name),
        reference_start = min(reference_start),
        reference_end = max(reference_end),
        matches = sum(matches),
        mismatches = sum(mismatches),
        deletion_events = sum(deletion_events),
        insertion_events = sum(insertion_events),
        deletions = sum(deletions),
        insertions = sum(insertions),
        reference_name.liftover = unique(reference_name.liftover),
        reference_start.liftover = min(reference_start.liftover),
        reference_end.liftover = max(reference_end.liftover),
        matches.liftover = sum(matches.liftover),
        mismatches.liftover = sum(mismatches.liftover),
        deletion_events.liftover = sum(deletion_events.liftover),
        insertion_events.liftover = sum(insertion_events.liftover),
        deletions.liftover = sum(deletions.liftover),
        insertions.liftover = sum(insertions.liftover),
        sample = paste(unique(sample), collapse = ";"),
        # sample= unique(sample),
        contig = paste(unique(contig), collapse = ";"),
        contig_start = min(contig_start),
        contig_end = max(contig_end),
        overlap = sum(overlap)
    ) %>%
    mutate(
        perID_by_matches = 100 * matches / (matches + mismatches),
        perID_by_events =
            100 * matches / (matches + mismatches + insertion_events + deletion_events),
        perID_by_all =
            100 * matches / (matches + mismatches + insertions + deletions),
        perID_by_matches.liftover =
            100 * matches.liftover / (matches.liftover + mismatches.liftover),
        perID_by_events.liftover =
            100 * matches.liftover / (matches.liftover + mismatches.liftover + insertion_events.liftover + deletion_events.liftover),
        perID_by_all.liftover =
            100 * matches.liftover / (matches.liftover + mismatches.liftover + insertions.liftover + deletions.liftover),
        original_source =
            paste(contig, ":", contig_start, "-", contig_end, sep = "")
    ) %>%
    data.table()

remove_par <- (df$reference_name == "chrX" &
    df$reference_name.liftover == "chrY") |
    (df$reference_name == "chrY" &
        df$reference_name.liftover == "chrX")
df <- df[!remove_par, ]

df
# reference_name == reference_name.liftover &
gc.df <- df[
    (perID_by_matches >= 99.5 &
        perID_by_all > perID_by_all.liftover &
        (
            (mismatches.liftover - mismatches >= 2 * window / 1e4) |
                (mismatches.liftover / mismatches > 2)
        )
    )
]
dim(gc.df)
print("data subset")
if (F) {
    dim(gc.df)
    p <- df[perID_by_all >= 99.0 &
        perID_by_all - perID_by_all.liftover > 0.00 &
        matches - matches.liftover > 0 &
        matches + mismatches > 9e3 &
        matches.liftover + mismatches.liftover > 9e3 &
        `#reference_name` != "chrY"] %>%
        filter(perID_by_events > 99.5) %>%
        ggplot() +
        geom_histogram(aes(matches - matches.liftover),
            binwidth = 1
        ) +
        facet_zoom(xlim = c(0, 20)) +
        # scale_y_continuous(trans = "log10") +
        # scale_x_continuous(trans = "log10") +
        # annotation_logticks() +
        theme_cowplot()
    nrow(p$data)
    p <- p + annotate("text",
        x = 500, y = 500,
        label = paste("n = ", comma(nrow(p$data))),
        size = 3
    )
    ggsave("~/Desktop/gc.pdf", plot = p)
}

gc.df$name <- paste(
    gc.df$mismatches.liftover - gc.df$mismatches,
    ";",
    gc.df$reference_name,
    ":",
    gc.df$reference_start,
    sep = ""
)
gc.df$name <- gc.df$mismatches.liftover - gc.df$mismatches

gc.df$strand <- "."
gc.df$color <- "0,127,211"
gc.df$score <- pmin(gc.df$mismatches.liftover - gc.df$mismatches, 1000)
gc.df$thickStart <- gc.df$reference_start.liftover
gc.df$thickEnd <- gc.df$reference_end.liftover
gc.df$status <- "Acceptor"

odf <- gc.df[, c(
    "reference_name.liftover",
    "reference_start.liftover",
    "reference_end.liftover",
    "name",
    "score",
    "strand",
    "thickStart",
    "thickEnd",
    "color",
    "status",
    "reference_name",
    "reference_start",
    "reference_end",
    "mismatches.liftover",
    "mismatches",
    "perID_by_all.liftover",
    "perID_by_all",
    "sample",
    "original_source"
)]
fwrite(
    odf %>%
        arrange(reference_name.liftover, reference_start.liftover) %>%
        dplyr::rename(`#reference_name.liftover` = reference_name.liftover) %>%
        data.table(),
    file = snakemake@output$acceptor,
    sep = "\t",
    row.names = F,
    quote = F,
    scipen = 999
)

#
# add donor sites
#
odf2 <- data.table(copy(odf))
odf2$color <- "211,144,0"
odf2$status <- "Donor"

odf2$reference_name.liftover <- odf$reference_name
odf2$reference_start.liftover <- odf$reference_start
odf2$reference_end.liftover <- odf$reference_end

odf2$reference_name <- odf$reference_name.liftover
odf2$reference_start <- odf$reference_start.liftover
odf2$reference_end <- odf$reference_end.liftover

odf2$thickStart <- odf2$reference_start.liftover
odf2$thickEnd <- odf2$reference_end.liftover

fwrite(
    rbind(odf, odf2) %>%
        arrange(reference_name.liftover, reference_start.liftover) %>%
        dplyr::rename(`#reference_name.liftover` = reference_name.liftover) %>%
        data.table(),
    file = snakemake@output$bed,
    sep = "\t",
    row.names = F,
    quote = F,
    scipen = 999
)
# make the interaction file
names <- c(
    "#chrom", "chromStart", "chromEnd",
    "name",
    "score",
    "value",
    "exp",
    "color",
    "sourceChrom", "sourceStart", "sourceEnd", "sourceName", "sourceStrand",
    "targetChrom", "targetStart", "targetEnd", "targetName", "targetStrand"
)
sdf <- copy(odf)
ndf <- data.table()

ndf$`#chrom` <- sdf$`reference_name.liftover`
ndf$chromStart <- sdf$`reference_start.liftover`
ndf[sdf$`reference_start.liftover` > sdf$reference_start]$chromStart <-
    copy(sdf[sdf$`reference_start.liftover` > sdf$reference_start]$reference_start)

ndf$chromEnd <- sdf$`reference_end.liftover`
ndf[sdf$`reference_end.liftover` < sdf$reference_end]$chromEnd <-
    copy(sdf[sdf$`reference_end.liftover` < sdf$reference_end]$reference_end)

ndf$name <- "."
ndf$score <- pmin(sdf$mismatches.liftover - sdf$mismatches, 1000)
ndf$value <- sdf$mismatches.liftover / sdf$mismatches
ndf$exp <- "."
ndf$color <- 0

ndf$sourceChrom <- sdf$reference_name.liftover
ndf$sourceStart <- sdf$reference_start.liftover
ndf$sourceEnd <- sdf$reference_end.liftover
ndf$sourceName <- "."
ndf$sourceStrand <- "."

ndf$targetChrom <- sdf$reference_name
ndf$targetStart <- sdf$reference_start
ndf$targetEnd <- sdf$reference_end
ndf$targetName <- "."
ndf$targetStrand <- "."

# fix the columns when interchromosomal
inter <- as.character(sdf$reference_name) != as.character(sdf$reference_name.liftover)
sum(inter)
dim(odf)
dim(sdf)
dim(ndf)
print(head(ndf))
if (T) {
    print("inter: setting 1-3")
    ndf[inter]$`#chrom` <- sdf[inter]$reference_name.liftover
    ndf[inter]$chromStart <- sdf[inter]$reference_start.liftover
    ndf[inter]$chromEnd <- sdf[inter]$reference_end.liftover

    print("inter: setting source")
    ndf[inter]$sourceChrom <- sdf[inter]$reference_name
    ndf[inter]$sourceStart <- sdf[inter]$reference_start
    ndf[inter]$sourceEnd <- sdf[inter]$reference_end

    print("inter: setting target name")
    ndf[inter]$targetChrom <- sdf[inter]$reference_name.liftover
    print("inter: setting target start")
    ndf[inter]$targetStart <- sdf[inter]$reference_start.liftover
    print("inter: setting target end")
    ndf[inter]$targetEnd <- sdf[inter]$reference_end.liftover
} else {
    ndf <- ndf[!inter]
}
print(head(ndf))
print(dim(ndf))
fwrite(
    ndf,
    file = snakemake@output$interact,
    sep = "\t",
    row.names = F,
    quote = F,
    scipen = 999
)
print("written")
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
library(tidyverse)
library(ggnewscale)
library(ggrepel)
library(data.table)
library(glue)
library(RColorBrewer)
library(scales)
library(cowplot)
library(argparse)
library(karyoploteR)
library(GenomicRanges)

dir <- paste0(getwd(), "/workflow/scripts")
load(glue("{dir}/chm13.karyo.RData"))


# create parser object
indir <- "~/Desktop"
parser <- ArgumentParser()
parser$add_argument("-a",
                    "--asm",
                    help = "bed file with all the asm mapping",
                    default = glue("{indir}/GM08714_1.bed"))
parser$add_argument("-b", "--asm2", help = "bed file with a second asm mapping")#, default = glue("{indir}/GM08714_2.bed"))
parser$add_argument("-k", "--karyotype", help = "karyotpye file for different genomes")
parser$add_argument("--min", help = "minimum amount of total alginemnts between a target and query for it to appear", default = 1e6)
parser$add_argument("-p", "--plot", help = "output plot, must have .pdf ext.", default = "~/Desktop/ideogram.pdf")
args <- parser$parse_args()
filename <- args$asm


asmdf <- function(filename, colors, minalnsize = args$min) {
  asmvshg <- read.table(filename, header = T, comment.char = ">")
  names(asmvshg)[1:3] <- c("chr", "start", "end")
  asmvshg <- asmvshg %>%
    group_by(query_name, chr) %>%
    summarise(
      bp_aligned = sum(end - start),
      min_start = min(start),
      max_end = max(end),
    ) %>%
    filter(bp_aligned > minalnsize) %>%
    merge(asmvshg) %>%
    mutate(group_num = group_indices(., query_name, chr)) %>%
    group_by(query_name, chr) %>%
    mutate(count_in_chr = n(),) %>%
    ungroup() %>%
    arrange(chr, min_start) %>%
    data.table()

  asmvshg$name <- asmvshg$query_name
  print(head(asmvshg))
  print(tail(asmvshg))
  curcolor <- 1
  lencolors <- length(colors)
  precontig <- ""
  asmcolor <- NULL
  seen <- c()
  y <- NULL
  for (i in 1:nrow(asmvshg)) {
    contig <- as.character(asmvshg$name[i])
    if (contig != precontig) {
      curcolor <- (curcolor + 1) %% lencolors
      precontig <- contig
    }
    asmcolor <- c(asmcolor, colors[curcolor + 1])
    y <- c(y, curcolor / 4)
  }
  asmvshg$color <- asmcolor
  asmvshg$y <- y
  asmvshg$y1 <- asmvshg$y + .25
  print(head(asmvshg))
  return(asmvshg)
}

asmvshg <- asmdf(filename, c("#2081f9", "#f99820"))

tables = list(asmvshg)
if (!is.null(args$asm2)) {
  asmvshg2 <- asmdf(args$asm2, c("#159934", "#99157a"))
  tables[[2]] = asmvshg2
}
tables


cex <- 0.5

print("Plotting")

pdf(file = args$plot,
    width = 9,
    height = 11)

if (is.null(args$asm2)) {
  kp <-
    plotKaryotype(genome = GENOME,
                  cytobands = CYTOFULL,
                  chromosomes = NOM)
} else {
  kp <-
    plotKaryotype(
      genome = GENOME,
      cytobands = CYTOFULL,
      chromosomes = NOM,
      plot.type = 2
    )
}

for (i in 1:length(tables)) {
  data = tables[[i]]
  data$mid = (data$y + data$y1) / 2

  data_u = data %>%
    group_by(chr, query_name, mid, min_start, max_end, color) %>%
    summarise()

  # add spanning lines
  kpRect(
    kp,
    chr = data_u$chr,
    x0 = data_u$min_start,
    x1 = data_u$max_end,
    y0 = data_u$mid,
    y1 = data_u$mid,
    col = data_u$color,
    data.panel = i
  )
  # add colored blocks
  kpRect(
    kp,
    chr = data$chr,
    x0 = data$start,
    x1 = data$end,
    y0 = data$y,
    y1 = data$y1,
    col = data$color,
    data.panel = i
  )
}

dev.off()
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
hub = """
hub gene-conversion
shortLabel gene-conversion
longLabel gene-conversion
genomesFile genomes.txt
email mvollger.edu
"""

genomes = """
genome {ref}
trackDb trackDb.txt
"""

track_db_header = """
track gene-conversion
compositeTrack off
shortLabel gene-conversion
longLabel gene-conversion
visibility hide
type bigBed 9 +
itemRgb on
maxItems 100000
filter.score 5:1000
filterByRange.score on
filterLimits.score 0:1000
filterLabel.score Minimum decrease in mismatches
"""

track = """
    track g-c-{sm}
    parent gene-conversion
    bigDataUrl gene-conversion/{sm}.bb
    shortLabel {sm} gc D/A
    longLabel {sm} gene conversion
    type bigBed 9 +
    itemRgb on
    priority {pri}
    visibility dense
"""


track_db_interact_header = """
track interact-gene-conversion
compositeTrack off
shortLabel interact-gc
longLabel gene conversion interactions
visibility hide
type bigInteract
maxItems 100000
filter.score 5:1000
filterByRange.score on
filterLimits.score 0:1000
"""

track_interact = """
    track interact-g-c-{sm}
    parent gene-conversion
    bigDataUrl gene-conversion/{sm}.interact.bb
    shortLabel {sm} gc
    longLabel {sm} gene conversion interactions
    type bigInteract
    maxHeightPixels 100:30:5
    priority {pri2}
    visibility full
"""

track_super = """
track gene-conversion-by-sample
superTrack on show
shortLabel gc-by-sample
longLabel gene conversion by sample
filter.score 5:1000
filterByRange.score on
filterLimits.score 0:1000

"""

track_comp = """
    track {sm}
    parent gene-conversion-by-sample
    compositeTrack on
    shortLabel {sm}-gc
    longLabel {sm} gene conversion
    type bigWig
    visibility full

        track gc-{sm}
        parent {sm}
        bigDataUrl gene-conversion/{sm}.bb
        shortLabel {sm} gc
        longLabel {sm} gene conversion
        type bigBed 9 +
        itemRgb on
        visibility dense

        track interact-{sm}
        parent {sm}
        bigDataUrl gene-conversion/{sm}.interact.bb
        shortLabel {sm} interact
        longLabel {sm} interactions
        type bigInteract
        maxHeightPixels 100:30:5
        visibility full

"""


all_tracks = """
track g-c-interact
bigDataUrl all_candidate_interactions.bb
shortLabel all gc interact
longLabel all gene conversion interactions
type bigInteract
visibility hide

track Donor 
bigDataUrl all_candidate_windows_donor.bw
shortLabel Donor 
longLabel Donor
type bigWig
color 211,144,0
autoScale on
visibility full

track Acceptor 
bigDataUrl all_candidate_windows_acceptor.bw
shortLabel Acceptor 
longLabel Acceptor
type bigWig
color 0,127,211
autoScale on
visibility full

track gene-conversion-windows
bigDataUrl all_candidate_windows.bb
shortLabel all g-c windows
longLabel all gene conversion windows
type bigBed 9 +
itemRgb on
visibility dense
maxItems 100000

"""


view_format_super = """
# SuperTrack declaration no type or visibility is required
# However "show" is needed to have a superTrack on by  default
track gene-conversion
longLabel gene conversion for HPRC samples
superTrack on show
shortLabel gene-conversion

"""
view_format_comp = """
    # Composite declaration, usually composite tracks are all of one type,
    #   and the type can be declared.
    # When a mixed type (some bigBeds, some bigWigs) you need to use the unusual
    #   'type bed 3' declaration.
    # The subGroup1 line will define groups,
    #   in this case the special 'view' group
    #   (a new subGroup2 could be metadata)
    # Later individual tracks will identify what 'subGroups id' they belong to.
    track gene-conversion-by-sample
    type bed 3
    longLabel gene conversion by sample
    parent gene-conversion
    compositeTrack on
    shortLabel gc-by-sample
    visibility full
    subGroup1 view Views bb=Colored_bigBed_items int=Interact_Data bg=BigBedGraph_items

"""
view_fromat_bb = """
        # This is the unexpected part about views,
        #    you need a separate parent to group the view
        # So this new view-specific stanza with "view id"
        #    can collect all tracks with some visibility settings
        track gene-conversion-by-sample-bb
        parent gene-conversion-by-sample on
        view bb
        visibility pack
        itemRgb on 
        maxItems 100000
        filter.score 5:1000
        filterByRange.score on
        filterLimits.score 0:1000

"""
view_format_bb_sm = """
            # Child bigBed in this view
            # The 'subGroups view=bb' shares this track belongs in a view,
            #    even though a parent declaration is also needed
            # All these tracks should be the same type of data
            track gene-conversion-by-sample-bb-{sm}
            parent gene-conversion-by-sample-bb
            type bigBed 9 +
            longLabel {sm} gene conversion bb 
            bigDataUrl gene-conversion/{sm}.bb
            shortLabel {sm}-gc-bb
            subGroups view=bb

"""
view_format_int = """
        # New View Stanza that collects all interact in this composite
        # This declares related bigInteract tracks    
        track gene-conversion-by-sample-interact
        parent gene-conversion-by-sample on
        view int
        visibility full
        maxHeightPixels 100:55:5
        maxItems 100000
        filter.score 5:1000
        filterByRange.score on
        filterLimits.score 0:1000

"""
view_format_int_sm = """
            # Child one Interact
            track gene-conversion-by-sample-interact-{sm}
            parent gene-conversion-by-sample-interact
            type bigInteract
            longLabel {sm} gene conversion interactions
            bigDataUrl gene-conversion/{sm}.interact.bb
            shortLabel {sm}-gc-interact
            subGroups view=int

"""

view_format_bg = """
        track gene-conversion-by-sample-bg
        parent gene-conversion-by-sample on
        view bg
        visibility full
        maxHeightPixels 100:10:5
        maxItems 100000
"""

view_format_bg_sm = """
            track gene-conversion-by-sample-bg-{sm}
            parent gene-conversion-by-sample-bg
            longLabel {sm} gene conversion bg
            bigDataUrl gene-conversion/{sm}.bg
            shortLabel {sm}-gc-bg
            subGroups view=bg
            autoScale Off
            graphTypeDefault Bar
            gridDefault OFF
            windowingFunction Mean
            color 175,4,4
            altColor 47,79,79
            viewLimits 0:5
            type bigWig 0 1000
"""

# Write trackDb.txt.  Three alternative layouts exist in this script; the
# hard-coded if/elif chain selects the "views" layout (superTrack -> composite
# -> per-view children).  The aggregate tracks are always written first.
# Side-effect list comprehensions from the original were replaced with plain
# for-loops (comprehensions should build values, not perform I/O).
with open(snakemake.output.track, "w") as out:
    out.write(all_tracks)
    if False:  # legacy flat layout — kept for reference, never taken
        out.write(track_db_header)
        # out.write(track_db_interact_header)
        for idx, sm in enumerate(snakemake.params.samples):
            # odd priorities for the bigBed track, even for its interact twin
            out.write(
                (track + track_interact).format(
                    sm=sm, pri=2 * idx + 1, pri2=2 * idx + 2
                )
            )
    elif True:  # current layout: composite with bb / int / bg views
        out.write(view_format_super)
        out.write(view_format_comp)
        # per-sample colored bigBed tracks
        out.write(view_fromat_bb)
        for sm in snakemake.params.samples:
            out.write(view_format_bb_sm.format(sm=sm))
        # per-sample bigInteract tracks
        out.write(view_format_int)
        for sm in snakemake.params.samples:
            out.write(view_format_int_sm.format(sm=sm))
        # per-sample bigWig coverage tracks
        out.write(view_format_bg)
        for sm in snakemake.params.samples:
            out.write(view_format_bg_sm.format(sm=sm))
    else:  # one-composite-per-sample layout — unreachable
        out.write(track_super)
        for sm in snakemake.params.samples:
            out.write(track_comp.format(sm=sm))

# Write hub.txt.  The original used bare open(...).write(...), which leaves
# the file handle to be closed (and flushed) by garbage collection; use a
# context manager so the file is closed deterministically.
with open(snakemake.output.hub, "w") as hub_out:
    hub_out.write(hub)

# Write genomes.txt, remapping the CHM13 v1.1 reference wildcard to the
# genome name the UCSC browser hosts it under.
ref = snakemake.wildcards.ref
if "CHM13_V1.1" in ref:
    print("changing ref")
    ref = "t2t-chm13-v1.1"
with open(snakemake.output.genomes, "w") as genomes_out:
    genomes_out.write(genomes.format(ref=ref))
Show/Hide 40 more snippets with no or duplicated tags.

Login to post a comment if you would like to share your experience with this workflow.

Do you know this workflow well? If so, you can request seller status and start supporting this workflow.

Free

Created: 1yr ago
Updated: 1yr ago
Maintainers: public
URL: https://github.com/mrvollger/asm-to-reference-alignment
Name: asm-to-reference-alignment
Version: v0.1
Badge:
workflow icon

Insert copied code into your website to add a link to this workflow.

Downloaded: 0
Copyright: Public Domain
License: None
  • Future updates

Related Workflows

cellranger-snakemake-gke
snakemake workflow to run cellranger on a given bucket using gke.
A Snakemake workflow for running cellranger on a given bucket using Google Kubernetes Engine. The usage of this workflow ...