Workflow Steps and Code Snippets

361 tagged steps and code snippets that match keyword seaborn

Snakemake workflow: dna-seq-gatk-variant-calling (v2.1.1)

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import sys
sys.stderr = open(snakemake.log[0], "w")

import common
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

calls = pd.read_table(snakemake.input[0], header=[0, 1])
samples = [name for name in calls.columns.levels[0] if name != "VARIANT"]
sample_info = calls.loc[:, samples].stack([0, 1]).unstack().reset_index(1, drop=False)
sample_info = sample_info.rename({"level_1": "sample"}, axis=1)

sample_info = sample_info[sample_info["DP"] > 0]
sample_info["freq"] = sample_info["AD"] / sample_info["DP"]
sample_info.index = np.arange(sample_info.shape[0])

plt.figure()

sns.stripplot(x="sample", y="freq", data=sample_info, jitter=True)
plt.ylabel("allele frequency")
plt.xticks(rotation="vertical")

plt.savefig(snakemake.output.freqs)

plt.figure()

sns.stripplot(x="sample", y="DP", data=sample_info, jitter=True)
plt.ylabel("read depth")
plt.xticks(rotation="vertical")

plt.savefig(snakemake.output.depths)

Snakemake workflow: dna-seq-gatk-variant-calling (v2.1.1)

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import sys
sys.stderr = open(snakemake.log[0], "w")

import common
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

calls = pd.read_table(snakemake.input[0], header=[0, 1])
samples = [name for name in calls.columns.levels[0] if name != "VARIANT"]
sample_info = calls.loc[:, samples].stack([0, 1]).unstack().reset_index(1, drop=False)
sample_info = sample_info.rename({"level_1": "sample"}, axis=1)

sample_info = sample_info[sample_info["DP"] > 0]
sample_info["freq"] = sample_info["AD"] / sample_info["DP"]
sample_info.index = np.arange(sample_info.shape[0])

plt.figure()

sns.stripplot(x="sample", y="freq", data=sample_info, jitter=True)
plt.ylabel("allele frequency")
plt.xticks(rotation="vertical")

plt.savefig(snakemake.output.freqs)

plt.figure()

sns.stripplot(x="sample", y="DP", data=sample_info, jitter=True)
plt.ylabel("read depth")
plt.xticks(rotation="vertical")

plt.savefig(snakemake.output.depths)

Snakemake workflow: dna-seq-gatk-variant-calling (v2.1.1)

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import sys
sys.stderr = open(snakemake.log[0], "w")

import common
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

calls = pd.read_table(snakemake.input[0], header=[0, 1])
samples = [name for name in calls.columns.levels[0] if name != "VARIANT"]
sample_info = calls.loc[:, samples].stack([0, 1]).unstack().reset_index(1, drop=False)
sample_info = sample_info.rename({"level_1": "sample"}, axis=1)

sample_info = sample_info[sample_info["DP"] > 0]
sample_info["freq"] = sample_info["AD"] / sample_info["DP"]
sample_info.index = np.arange(sample_info.shape[0])

plt.figure()

sns.stripplot(x="sample", y="freq", data=sample_info, jitter=True)
plt.ylabel("allele frequency")
plt.xticks(rotation="vertical")

plt.savefig(snakemake.output.freqs)

plt.figure()

sns.stripplot(x="sample", y="DP", data=sample_info, jitter=True)
plt.ylabel("read depth")
plt.xticks(rotation="vertical")

plt.savefig(snakemake.output.depths)

Snakemake workflow: dna-seq-gatk-variant-calling (v2.1.1)

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import sys
sys.stderr = open(snakemake.log[0], "w")

import common
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

calls = pd.read_table(snakemake.input[0], header=[0, 1])
samples = [name for name in calls.columns.levels[0] if name != "VARIANT"]
sample_info = calls.loc[:, samples].stack([0, 1]).unstack().reset_index(1, drop=False)
sample_info = sample_info.rename({"level_1": "sample"}, axis=1)

sample_info = sample_info[sample_info["DP"] > 0]
sample_info["freq"] = sample_info["AD"] / sample_info["DP"]
sample_info.index = np.arange(sample_info.shape[0])

plt.figure()

sns.stripplot(x="sample", y="freq", data=sample_info, jitter=True)
plt.ylabel("allele frequency")
plt.xticks(rotation="vertical")

plt.savefig(snakemake.output.freqs)

plt.figure()

sns.stripplot(x="sample", y="DP", data=sample_info, jitter=True)
plt.ylabel("read depth")
plt.xticks(rotation="vertical")

plt.savefig(snakemake.output.depths)

In this repository, we present the code for the analysis of study of the transcription's impact on Escherichia coli chromosome.

  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
import bacchus.directional as bcd
import bacchus.insulation as bci
import bacchus.io as bcio
import cooler
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import matplotlib.collections as mc
import numpy as np
import os
import pandas as pd
import scipy.sparse as sp
import scipy.stats as st
import seaborn as sns
from random import choices

mat_file = snakemake.input.mat
rna_file = snakemake.input.rna
annotation_file = snakemake.input.annotation
cmap = snakemake.params.cmap
dpi = snakemake.params.dpi
text_file = str(snakemake.output.text_file)
full_mat_file = str(snakemake.output.full_mat)
zoom_1_file = str(snakemake.output.zoom1)
zoom_2_file = str(snakemake.output.zoom2)
full_mat_di_file = str(snakemake.output.full_mat_di)
zoom_1_di_file = str(snakemake.output.zoom1_di)
zoom_2_di_file = str(snakemake.output.zoom2_di)
out_files_mat = [full_mat_file, zoom_1_file, zoom_2_file]
out_files_mat_di = [full_mat_di_file, zoom_1_di_file, zoom_2_di_file]
bor_trans = str(snakemake.output.bor_trans)
trans_bor = str(snakemake.output.trans_bor)

# Create outdir if necessary.
os.makedirs(str(snakemake.params.outdir), exist_ok=True)


def write_stats(text_file, text, writing_type="a"):
    with open(text_file, writing_type) as out:
        out.write(f"{text}\n")


# Compute CIDs at 5kb resolution with 100kb window size.
M5000 = cooler.Cooler(f"{mat_file}::/resolutions/{5000}").matrix(
    balance=True, sparse=True
)[:]
di5000 = bcd.directional_index(M5000, 20)
l5000 = bcd.di_borders(di5000)
write_stats(
    text_file,
    f"Numbers of CIDs at 5kb with 100kb window size: {len(l5000)}",
    "w",
)

# Compute CIDs at 2kb resolution with 50kb window size.
M2000 = cooler.Cooler(f"{mat_file}::/resolutions/{2000}").matrix(
    balance=True, sparse=True
)[:]
di2000 = bcd.directional_index(M2000, 25)
l2000 = bcd.di_borders(di2000)
write_stats(
    text_file, f"Numbers of CIDs at 2kb with 50kb window size: {len(l2000)}"
)

# Compute CIDs at 1kb resolution using insulation score.
M1000 = cooler.Cooler(f"{mat_file}::/resolutions/{1000}").matrix(
    balance=True, sparse=True
)[:]
di1000 = bcd.directional_index(M1000, 20)
l1000 = bcd.di_borders(di1000)
write_stats(
    text_file, f"Numbers of CIDs at 1kb with 50kb window size: {len(l1000)}"
)

mat = cooler.Cooler(f"{mat_file}::/resolutions/{1000}").matrix(
    balance=True, sparse=False
)[:]
mat[np.isnan(mat)] = 0
final_borders_1000, lri_1000 = bci.get_insulation_score(
    mat, [10, 15, 20, 25, 30]
)
write_stats(
    text_file,
    f"{len(final_borders_1000)} CIDs have been detected with insulation score at 1kb resolution.",
)


rna, _ = bcio.extract_big_wig(rna_file, ztransform=False)

annotation = pd.DataFrame(
    columns=["type", "start", "end", "strand", "gene_name", "rpkm"]
)

n = 0
with open(annotation_file, "r") as file:
    for line in file:
        if line.startswith("#"):
            continue
        elif line.startswith(">"):
            break
        else:
            line = line.split("\t")
            if line[2] in ["gene", "tRNA", "rRNA"]:
                if line[2] == "gene":
                    name = line[8].split("Name=")[-1].split(";")[0]
                    annot = {
                        "type": line[2],
                        "start": int(line[3]),
                        "end": int(line[4]),
                        "strand": line[6],
                        "gene_name": name,
                        "rpkm": np.mean(rna[int(line[3]) : int(line[4])]),
                    }
                    annotation = annotation.append(annot, ignore_index=True)


###
# Print matrices with CIDs borders (Fig 1e).
###
starts = [0, 2500, 3000]
ends = [len(mat), 3000, 3500]
col = ["blue", "lime", "cyan"]

for i in range(3):
    out_file, start, end = out_files_mat[i], starts[i], ends[i]

    vmax = 99
    title = None
    fig, ax = plt.subplots(1, 1, figsize=(7, 7), dpi=dpi)

    # Axis values
    scaling_factor = 1.0
    axis = "kb"

    # No end values given.
    if end == 0:
        end = len(mat)

    # Display plots
    im = ax.imshow(
        mat[start:end, start:end],
        cmap=cmap,
        vmin=0,
        vmax=np.nanpercentile(mat, vmax),
        extent=(
            start * scaling_factor,
            end * scaling_factor,
            end * scaling_factor,
            start * scaling_factor,
        ),
    )

    # Legend
    ax.set_xlabel(f"Genomic coordinates ({axis:s})", fontsize=16)
    ax.set_ylabel(f"Genomic coordinates ({axis:s})", fontsize=16)
    ax.tick_params(axis="both", labelsize=16)

    # Title
    if title is not None:
        ax.set_title(title, size=18)

    # Colorbar
    cbar = plt.colorbar(im, shrink=0.33, anchor=(0, 0.5))
    cbar.ax.tick_params(labelsize=16)

    first = True
    for i in [x * 5 for x in l5000]:
        if i in np.arange(start, end):
            #         plt.axvline(i, lw=1, ls='dashed', c='blue')
            if first:
                s = i
                first = False
                ax.axvline(
                    s,
                    ymin=(end - s) / (end - start),
                    ymax=1,
                    c=col[0],
                    lw=1,
                    zorder=0.5,
                )
                ax.axhline(
                    s,
                    xmin=0,
                    xmax=1 - ((end - s) / (end - start)),
                    c=col[0],
                    lw=1,
                    zorder=0.5,
                )
            else:
                e = i
                ax.add_patch(
                    patches.Rectangle(
                        (s, s), e - s, e - s, edgecolor=col[0], fill=False
                    )
                )
                s = i
    ax.axvline(
        s, ymin=(end - s) / (end - start), ymax=0, c=col[0], lw=1, zorder=0.5
    )
    ax.axhline(
        s,
        xmin=1,
        xmax=1 - ((end - s) / (end - start)),
        c=col[0],
        lw=1,
        zorder=0.5,
    )

    first = True
    for i in [x * 5 for x in l2000]:
        if i in np.arange(start, end):
            #         plt.axvline(i, lw=1, ls='dashed', c='cyan')
            if first:
                s = i
                first = False
                ax.axvline(
                    s,
                    ymin=(end - s) / (end - start),
                    ymax=1,
                    c=col[1],
                    lw=1,
                    zorder=0.5,
                )
                ax.axhline(
                    s,
                    xmin=0,
                    xmax=1 - ((end - s) / (end - start)),
                    c=col[1],
                    lw=1,
                    zorder=0.5,
                )
            else:
                e = i
                ax.add_patch(
                    patches.Rectangle(
                        (s, s), e - s, e - s, edgecolor=col[1], fill=False
                    )
                )
                s = i
    ax.axvline(
        s, ymin=(end - s) / (end - start), ymax=0, c=col[1], lw=1, zorder=0.5
    )
    ax.axhline(
        s,
        xmin=1,
        xmax=1 - ((end - s) / (end - start)),
        c=col[1],
        lw=1,
        zorder=0.5,
    )

    first = True
    for i in final_borders_1000:
        if i in np.arange(start, end):
            #         plt.axvline(i, lw=1, ls='dashed', c='magenta')
            if first:
                s = i
                first = False
                ax.axvline(
                    s,
                    ymin=(end - s) / (end - start),
                    ymax=1,
                    c=col[2],
                    lw=1,
                    zorder=0.5,
                )
                ax.axhline(
                    s,
                    xmin=0,
                    xmax=1 - ((end - s) / (end - start)),
                    c=col[2],
                    lw=1,
                    zorder=0.5,
                )
            else:
                e = i
                ax.add_patch(
                    patches.Rectangle(
                        (s, s), e - s, e - s, edgecolor=col[2], fill=False
                    )
                )
                s = i
    ax.axvline(
        s, ymin=(end - s) / (end - start), ymax=0, c=col[2], lw=1, zorder=0.5
    )
    ax.axhline(
        s,
        xmin=1,
        xmax=1 - ((end - s) / (end - start)),
        c=col[2],
        lw=1,
        zorder=0.5,
    )

    # Savefig
    plt.savefig(out_file, dpi=dpi, bbox_inches="tight")
    plt.close()


###
# Print full matrices with CIDs borders using DI only (Supp Fig 1f).
###

for i in range(3):
    out_file, start, end = out_files_mat_di[i], starts[i], ends[i]

    vmax = 99
    title = None
    fig, ax = plt.subplots(1, 1, figsize=(7, 7), dpi=dpi)

    # Axis values
    scaling_factor = 1.0
    axis = "kb"

    # No end values given.
    if end == 0:
        end = len(mat)

    # Display plots
    im = ax.imshow(
        mat[start:end, start:end],
        cmap=cmap,
        vmin=0,
        vmax=np.percentile(mat, vmax),
        extent=(
            start * scaling_factor,
            end * scaling_factor,
            end * scaling_factor,
            start * scaling_factor,
        ),
    )

    # Legend
    ax.set_xlabel(f"Genomic coordinates ({axis:s})", fontsize=16)
    ax.set_ylabel(f"Genomic coordinates ({axis:s})", fontsize=16)
    ax.tick_params(axis="both", labelsize=16)

    # Title
    if title is not None:
        ax.set_title(title, size=18)

    # Colorbar
    cbar = plt.colorbar(im, shrink=0.33, anchor=(0, 0.5))
    cbar.ax.tick_params(labelsize=16)

    first = True
    for i in [x * 5 for x in l5000]:
        if i in np.arange(start, end):
            #         plt.axvline(i, lw=1, ls='dashed', c='blue')
            if first:
                s = i
                first = False
                ax.axvline(
                    s,
                    ymin=(end - s) / (end - start),
                    ymax=1,
                    c=col[0],
                    lw=1,
                    zorder=0.5,
                )
                ax.axhline(
                    s,
                    xmin=0,
                    xmax=1 - ((end - s) / (end - start)),
                    c=col[0],
                    lw=1,
                    zorder=0.5,
                )
            else:
                e = i
                ax.add_patch(
                    patches.Rectangle(
                        (s, s), e - s, e - s, edgecolor=col[0], fill=False
                    )
                )
                s = i
    ax.axvline(
        s, ymin=(end - s) / (end - start), ymax=0, c=col[0], lw=1, zorder=0.5
    )
    ax.axhline(
        s,
        xmin=1,
        xmax=1 - ((end - s) / (end - start)),
        c=col[0],
        lw=1,
        zorder=0.5,
    )

    first = True
    for i in [x * 5 for x in l2000]:
        if i in np.arange(start, end):
            # plt.axvline(i, lw=1, ls='dashed', c='cyan')
            if first:
                s = i
                first = False
                ax.axvline(
                    s,
                    ymin=(end - s) / (end - start),
                    ymax=1,
                    c=col[1],
                    lw=1,
                    zorder=0.5,
                )
                ax.axhline(
                    s,
                    xmin=0,
                    xmax=1 - ((end - s) / (end - start)),
                    c=col[1],
                    lw=1,
                    zorder=0.5,
                )
            else:
                e = i
                ax.add_patch(
                    patches.Rectangle(
                        (s, s), e - s, e - s, edgecolor=col[1], fill=False
                    )
                )
                s = i
    ax.axvline(
        s, ymin=(end - s) / (end - start), ymax=0, c=col[1], lw=1, zorder=0.5
    )
    ax.axhline(
        s,
        xmin=1,
        xmax=1 - ((end - s) / (end - start)),
        c=col[1],
        lw=1,
        zorder=0.5,
    )

    first = True
    for i in [x * 5 for x in l1000]:
        if i in np.arange(start, end):
            # plt.axvline(i, lw=1, ls='dashed', c='cyan')
            if first:
                s = i
                first = False
                ax.axvline(
                    s,
                    ymin=(end - s) / (end - start),
                    ymax=1,
                    c=col[2],
                    lw=1,
                    zorder=0.5,
                )
                ax.axhline(
                    s,
                    xmin=0,
                    xmax=1 - ((end - s) / (end - start)),
                    c=col[2],
                    lw=1,
                    zorder=0.5,
                )
            else:
                e = i
                ax.add_patch(
                    patches.Rectangle(
                        (s, s), e - s, e - s, edgecolor=col[2], fill=False
                    )
                )
                s = i
    ax.axvline(
        s, ymin=(end - s) / (end - start), ymax=0, c=col[2], lw=1, zorder=0.5
    )
    ax.axhline(
        s,
        xmin=1,
        xmax=1 - ((end - s) / (end - start)),
        c=col[2],
        lw=1,
        zorder=0.5,
    )

    # Savefig
    plt.savefig(out_file, dpi=dpi, bbox_inches="tight")
    plt.close()

###
# Transcription at borders.
###


def get_borders_transcriptions(annotation, borders, size, step):
    borders_transcriptions = []
    for i in annotation.index:
        if annotation.loc[i, "strand"] == "-":
            pos = annotation.loc[i, "end"]
        else:
            pos = annotation.loc[i, "start"]
        for j in borders:
            a = j * size + step
            if abs(pos - a) < 2500:
                borders_transcriptions.append(annotation.loc[i, "rpkm"])
    return borders_transcriptions


borders_trans_5000 = get_borders_transcriptions(annotation, l5000, 5000, 2500)
borders_trans_2000 = get_borders_transcriptions(annotation, l2000, 5000, 1000)
borders_trans_1000 = get_borders_transcriptions(
    annotation, final_borders_1000, 1000, 500
)
data = {
    "RPKM (log)": np.log(
        list(annotation.rpkm)
        + list(np.log(borders_trans_5000))
        + list(np.log(borders_trans_2000))
        + list(np.log(borders_trans_1000))
    ),
    "Genes": list(np.repeat(f"All genes\nn={len(annotation)}", len(annotation)))
    + list(
        np.repeat(f"5kb\nn={len(borders_trans_5000)}", len(borders_trans_5000))
    )
    + list(
        np.repeat(f"2kb\nn={len(borders_trans_2000)}", len(borders_trans_2000))
    )
    + list(
        np.repeat(f"1kb\nn={len(borders_trans_1000)}", len(borders_trans_1000))
    ),
}
data = pd.DataFrame(data)
data.replace([np.inf, -np.inf, np.nan], 0, inplace=True)

sns.violinplot(x="Genes", y="RPKM (log)", data=data, palette="tab10")
plt.xlabel("Resolution of the matrix to detect the borders", size=14)
plt.ylabel("RPKM (log)", size=14)
plt.title(
    "Genes transcription of genes at less\nthan 5kb of a detected border",
    size=16,
)
plt.xticks(size=12)
plt.yticks(size=12)
plt.savefig(bor_trans, bbox_inches="tight")
plt.close()

write_stats(
    text_file,
    f"p-value between 5kb borders and whole genome: {st.mannwhitneyu(np.log(list(annotation.rpkm)), np.log(borders_trans_5000))[1]}",
)
write_stats(
    text_file,
    f"p-value between 2kb borders and whole genome: {st.mannwhitneyu(np.log(list(annotation.rpkm)), np.log(borders_trans_2000))[1]}",
)
write_stats(
    text_file,
    f"p-value between 1kb borders and whole genome: {st.mannwhitneyu(np.log(list(annotation.rpkm)), np.log(borders_trans_1000))[1]}",
)

##
# Transcription depending on distance borders
###

annotation["dist_5kb"] = 0
annotation["dist_2kb"] = 0
annotation["dist_1kb"] = 0
annotation["log1p_rpkm"] = np.log1p(annotation["rpkm"])
for i in annotation.index:
    value = np.inf
    if annotation.loc[i, "strand"] == "+":
        for x in l5000:
            pos = annotation.loc[i, "start"]
            if pos < x * 5000:
                a = x * 5000 - pos
            elif pos > (x + 1) * 5000:
                a = pos - (x + 1) * 5000
            else:
                a = -1
            value = min(a, value)
    else:
        for x in l5000:
            pos = annotation.loc[i, "end"]
            if pos < x * 5000:
                a = x * 5000 - pos
            elif pos > (x + 1) * 5000:
                a = pos - (x + 1) * 5000
            else:
                a = -1
            value = min(a, value)
    annotation.loc[i, "dist_5kb"] = value

for i in annotation.index:
    value = np.inf
    if annotation.loc[i, "strand"] == "+":
        for x in l2000:
            pos = annotation.loc[i, "start"]
            if pos < x * 5000:
                a = x * 5000 - pos
            elif pos > (x + 0.4) * 5000:
                a = pos - (x + 0.4) * 5000
            else:
                a = -1
            value = min(a, value)
    else:
        for x in l2000:
            pos = annotation.loc[i, "end"]
            if pos < x * 5000:
                a = x * 5000 - pos
            elif pos > (x + 0.4) * 5000:
                a = pos - (x + 0.4) * 5000
            else:
                a = -1
            value = min(a, value)
    annotation.loc[i, "dist_2kb"] = value

for i in annotation.index:
    value = np.inf
    if annotation.loc[i, "strand"] == "+":
        for x in final_borders_1000:
            pos = annotation.loc[i, "start"]
            if pos < x * 1000:
                a = x * 1000 - pos
            elif pos > (x + 1) * 1000:
                a = pos - (x + 1) * 1000
            else:
                a = -1
            value = min(a, value)
    else:
        for x in final_borders_1000:
            pos = annotation.loc[i, "end"]
            if pos < x * 1000:
                a = x * 1000 - pos
            elif pos > (x + 1) * 1000:
                a = pos - (x + 1) * 1000
            else:
                a = -1
            value = min(a, value)
    annotation.loc[i, "dist_1kb"] = value

bin_means_l3, bin_edges_l3, binnumber_l3 = st.binned_statistic(
    annotation.dist_1kb,
    annotation.rpkm,
    statistic="mean",
    bins=41,
    range=(-5000, 200000),
)
bin_means_l2, bin_edges_l2, binnumber_l2 = st.binned_statistic(
    annotation.dist_2kb,
    annotation.rpkm,
    statistic="mean",
    bins=41,
    range=(-5000, 200000),
)
bin_means_l1, bin_edges_l1, binnumber_l1 = st.binned_statistic(
    annotation.dist_5kb,
    annotation.rpkm,
    statistic="mean",
    bins=41,
    range=(-5000, 200000),
)


def bootstrap_down(data):
    l = np.zeros((10000))
    for i in range(10000):
        l[i] = np.mean(choices(data, k=len(data)))
    return np.percentile(l, 5)


def bootstrap_up(data):
    l = np.zeros((10000))
    for i in range(10000):
        l[i] = np.mean(choices(data, k=len(data)))
    return np.percentile(l, 95)


bin_perc5_l1, bin_edges_l1, binnumber_l1 = st.binned_statistic(
    annotation.dist_5kb,
    annotation.rpkm,
    statistic=bootstrap_down,
    bins=41,
    range=(-5000, 200000),
)
bin_perc95_l1, bin_edges_l1, binnumber_l1 = st.binned_statistic(
    annotation.dist_5kb,
    annotation.rpkm,
    statistic=bootstrap_up,
    bins=41,
    range=(-5000, 200000),
)
bin_perc5_l2, bin_edges_l2, binnumber_l2 = st.binned_statistic(
    annotation.dist_2kb,
    annotation.rpkm,
    statistic=bootstrap_down,
    bins=41,
    range=(-5000, 200000),
)
bin_perc95_l2, bin_edges_l2, binnumber_l2 = st.binned_statistic(
    annotation.dist_2kb,
    annotation.rpkm,
    statistic=bootstrap_up,
    bins=41,
    range=(-5000, 200000),
)
bin_perc5_l3, bin_edges_l3, binnumber_l3 = st.binned_statistic(
    annotation.dist_1kb,
    annotation.rpkm,
    statistic=bootstrap_down,
    bins=41,
    range=(-5000, 200000),
)
bin_perc95_l3, bin_edges_l3, binnumber_l3 = st.binned_statistic(
    annotation.dist_1kb,
    annotation.rpkm,
    statistic=bootstrap_up,
    bins=41,
    range=(-5000, 200000),
)

plt.plot(bin_edges_l1[1:] / 1000, bin_means_l1, label="5kb")
plt.fill_between(
    bin_edges_l1[1:] / 1000, bin_perc5_l1, bin_perc95_l1, alpha=0.5
)
plt.plot(bin_edges_l2[1:] / 1000, bin_means_l2, label="2kb")
plt.fill_between(
    bin_edges_l2[1:] / 1000, bin_perc5_l2, bin_perc95_l2, alpha=0.5
)
plt.plot(bin_edges_l3[1:] / 1000, bin_means_l3, label="1kb")
plt.fill_between(
    bin_edges_l3[1:] / 1000, bin_perc5_l3, bin_perc95_l3, alpha=0.5
)
plt.xlabel("Distance from the closest border (kb)", size=14)
plt.ylabel("Mean RPKM", size=14)
plt.xlim(0, 50)
plt.title(
    "Genes transcription depending on\nthe distance to the closest border",
    size=16,
)
plt.xticks(size=12)
plt.yticks(size=12)
plt.legend()
plt.savefig(trans_bor, bbox_inches="tight")
plt.close()
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
import bacchus.hic as bch
import bacchus.io as bcio
import bacchus.plot as bcp
import cooler
import copy
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np
import os
from os.path import join
import pandas as pd
from scipy import ndimage
import seaborn as sns

# Import snakemake parameters.
annotation_file = snakemake.input.annotation
mat_wt_file = str(snakemake.input.cool_wt)
mat_rf_file = str(snakemake.input.cool_rf)
rna_wt_file = snakemake.input.rna_wt
rna_rf_file = snakemake.input.rna_rf
cov_wt_file = snakemake.input.cov_wt
cov_rf_file = snakemake.input.cov_rf
gc_file = snakemake.input.gc
epod_file = snakemake.input.EPODs
res = int(snakemake.params.res)
cmap = snakemake.params.cmap
outdir = str(snakemake.params.outdir)
width = snakemake.params.width
pos = snakemake.params.positions

# Create outdir if necessary.
os.makedirs(outdir, exist_ok=True)


def import_annotation_gff(annotation_file):
    """Function to create a table of the gene positions from the gff file."""
    annotation = pd.DataFrame(
        columns=["type", "start", "end", "strand", "name"]
    )
    with open(annotation_file, "r") as file:
        for line in file:
            # Header.
            if line.startswith("#"):
                continue
            # Stop at the fasta sequences.
            elif line.startswith(">"):
                break
            else:
                line = line.split("\t")
                if line[2] in ["gene", "tRNA", "rRNA"]:
                    if line[2] == "gene":
                        name = line[8].split("Name=")[-1].split(";")[0]
                        # Extract gene position.
                        annot = {
                            "type": line[2],
                            "start": int(line[3]),
                            "end": int(line[4]),
                            "strand": line[6],
                            "gene_name": name,
                        }
                        annotation = annotation.append(annot, ignore_index=True)
    return annotation


# Import files.
annotation = import_annotation_gff(annotation_file)
mat_wt = cooler.Cooler(f"{mat_wt_file}::/resolutions/{res}").matrix(
    balance=True, sparse=False
)[:]
mat_wt[np.isnan(mat_wt)] = 0
mat_rf = cooler.Cooler(f"{mat_rf_file}::/resolutions/{res}").matrix(
    balance=True, sparse=False
)[:]
mat_rf[np.isnan(mat_rf)] = 0
rna_wt, _ = bcio.extract_big_wig(rna_wt_file, binning=100)
rna_rf, _ = bcio.extract_big_wig(rna_rf_file, binning=100)
cov_wt, _ = bcio.extract_big_wig(cov_wt_file, binning=1000)
cov_rf, _ = bcio.extract_big_wig(cov_wt_file, binning=1000)
gc_content = pd.read_csv(gc_file, sep="\t", header=None).iloc[:, 1]
epod = pd.read_csv(epod_file, sep="\t", header=None)

# Make the rotation of the matrix to plot them
mat_wt_rot = copy.copy(mat_wt)
mat_wt_rot = bch.interpolate_white_lines(mat_wt_rot)
mat_wt_rot[np.isnan(mat_wt_rot)] = 0
mat_wt_rot = ndimage.rotate(mat_wt_rot, 45, reshape=True)
mat_rf_rot = copy.copy(mat_rf)
mat_rf_rot = bch.interpolate_white_lines(mat_rf_rot)
mat_rf_rot[np.isnan(mat_rf_rot)] = 0
mat_rf_rot = ndimage.rotate(mat_rf_rot, 45, reshape=True)


def plot_region(
    M1,
    M1_rot,
    M2,
    M2_rot,
    rna1,
    rna2,
    cov1,
    cov2,
    annotation,
    binning,
    width,
    zoom_ini,
    outfile,
    split=False,
):
    rna_binning = 100
    pal = sns.color_palette("Paired")

    # Defined values to specify the borders of the matrices and tables
    zoom = [zoom // binning for zoom in zoom_ini]
    width = width / 1000
    row1 = len(M1_rot) // 2 - int(np.sqrt(2) * width)
    row2 = len(M1_rot) // 2 + int(np.sqrt(2) * width)
    col1 = int(zoom[0] * np.sqrt(2))
    col2 = int(zoom[1] * np.sqrt(2))
    annotation_zoom = annotation[
        np.logical_and(
            annotation.start > zoom_ini[0], annotation.end < zoom_ini[1]
        )
    ]
    ymax = np.nanmax(
        np.concatenate(
            (
                rna1[zoom_ini[0] // rna_binning : zoom_ini[1] // rna_binning],
                rna2[zoom_ini[0] // rna_binning : zoom_ini[1] // rna_binning],
            )
        )
    )
    max_cov = np.nanmax(cov1) * 1.02
    min_cov = np.nanmin(cov1) * 0.98

    # Define panels depending on split or not.
    if split and ymax > 1500:
        a = 4
        fig, ax = plt.subplots(
            7,
            2,
            figsize=(16, 12),
            gridspec_kw={"height_ratios": [1, 30, 1, 6, 12, 3, 3]},
            sharex=False,
        )

        # Parameters to merge panel 2 and 3.
        d = 0.02
        D = 0.04
        for j in range(2):
            ax[a - 1, j].set_xlim(zoom_ini[0] // 1000, zoom_ini[1] // 1000)
            ax[a - 1, j].tick_params(axis="both", labelsize=15)
            ax[a - 1, j].spines["bottom"].set_visible(False)
            ax[a - 1, j].spines["top"].set_visible(False)
            ax[a - 1, j].spines["right"].set_visible(False)
            ax[a - 1, j].set_ylim(ymax - 525, ymax)
            ax[a - 1, j].get_xaxis().set_visible(False)

            # Add the small cut between the two panels
            kwargs = dict(
                transform=ax[a - 1, j].transAxes, color="k", clip_on=False
            )
            ax[a - 1, j].plot((-d, +d), (-D, +D), **kwargs)  # top-left diagonal
            kwargs.update(
                transform=ax[a, j].transAxes
            )  # switch to the bottom axes
            ax[a, j].plot(
                (-d, +d), (1 - d, 1 + d), **kwargs
            )  # bottom-left diagonal

        # Plot the RNA on the split panel
        ax[a - 1, 0].fill_between(
            np.arange(
                zoom_ini[0] // 1000, zoom_ini[1] // 1000, rna_binning / 1000
            ),
            rna1[zoom_ini[0] // rna_binning : zoom_ini[1] // rna_binning],
            color="black",
        )
        ax[a - 1, 1].fill_between(
            np.arange(
                zoom_ini[0] // 1000, zoom_ini[1] // 1000, rna_binning / 1000
            ),
            rna2[zoom_ini[0] // rna_binning : zoom_ini[1] // rna_binning],
            color="black",
        )

    else:
        a = 3
        fig, ax = plt.subplots(
            6,
            2,
            figsize=(16, 12),
            gridspec_kw={"height_ratios": [1, 30, 1, 15, 3, 3]},
            sharex=False,
        )

    # Plot expressed genes - blue are forward - red are reversed
    for j in range(2):
        ax[0, j].set_xlim(zoom_ini[0], zoom_ini[1])
        ax[0, j].get_xaxis().set_visible(False)
        ax[0, j].get_yaxis().set_visible(False)
        ax[2, j].set_xlim(zoom_ini[0], zoom_ini[1])
        ax[2, j].get_xaxis().set_visible(False)
        ax[2, j].get_yaxis().set_visible(False)
    pos = 1
    for i in range(len(annotation_zoom)):
        # Extract annotation information
        annot = annotation_zoom.iloc[
            i,
        ]
        strand = annot.strand
        start = annot.start // rna_binning
        end = annot.end // rna_binning
        name = annot.gene_name
        # Defined color depending on the strand
        if strand == "+":
            color = pal.as_hex()[1]
        else:
            color = pal.as_hex()[5]
        # Print it only if it transcribed (10% most transcribed genes)
        if np.mean(rna1[start:end]) >= 120.7628998139441:
            pos = pos * -1
            for j in range(2):
                ax[0, j].add_patch(
                    patches.Rectangle(
                        (start * 100, 0),
                        (end - start) * 100,
                        1,
                        edgecolor=color,
                        facecolor=color,
                        fill=True,
                    )
                )
                ax[0, j].text(
                    x=(start + ((end - start) / 2)) * 100,
                    y=pos + 0.5,
                    s=name,
                    rotation=90 * pos,
                    wrap=True,
                )
    color = pal.as_hex()[3]
    for i in epod.index:
        start = epod.loc[i, 3]
        end = epod.loc[i, 4]
        if (
            (start > zoom_ini[0])
            and (start < zoom_ini[1])
            or (end > zoom_ini[0])
            and (end < zoom_ini[1])
        ):
            start = max(start, zoom_ini[0]) // rna_binning
            end = min(end, zoom_ini[1]) // rna_binning
            for j in range(2):
                ax[2, j].add_patch(
                    patches.Rectangle(
                        (start * 100, 0),
                        (end - start) * 100,
                        1,
                        edgecolor=color,
                        facecolor=color,
                        fill=True,
                    )
                )
    #                 ax[2, j].text(
    #                     x=(start + ((end - start) / 2)) * 100,
    #                     y=pos + 0.5,
    #                     s="EPOD",
    #                     rotation=90 * pos,
    #                     wrap=True,
    #                 )

    # Plot the matrices
    ax[1, 0].set_ylabel("Genomic distance (kb)", fontsize=15)
    for j in range(2):
        ax[1, j].get_xaxis().set_visible(False)
        ax[1, j].tick_params(axis="both", labelsize=15)

    # Plot the matrice 1
    im = ax[1, 0].imshow(
        M1_rot[row1:row2, col1:col2],
        cmap=cmap,
        interpolation="none",
        vmin=0,
        vmax=np.nanpercentile(M1[zoom[0] : zoom[1], zoom[0] : zoom[1]], 97),
        extent=(col1 / np.sqrt(2), col2 / np.sqrt(2), width, -width),
    )

    # Plot the matrice 2
    ax[1, 1].imshow(
        M2_rot[row1:row2, col1:col2],
        cmap=cmap,
        interpolation="none",
        vmin=0,
        vmax=np.nanpercentile(M1[zoom[0] : zoom[1], zoom[0] : zoom[1]], 97),
        extent=(col1 / np.sqrt(2), col2 / np.sqrt(2), width, -width),
    )

    # RNAseq legends and plot at the bottom panel.
    ax[a, 0].set_ylabel("RNA count (CPM)", fontsize=15)
    for j in range(2):
        #         ax[a, j].set_xlabel("Genomic coordinates (kb)", size=15)
        ax[a, j].get_xaxis().set_visible(False)
        ax[a, j].tick_params(axis="both", labelsize=15)
        ax[a, j].set_xlim(zoom_ini[0] // 1000, zoom_ini[1] // 1000)
        ax[a, j].spines["top"].set_visible(False)
        ax[a, j].spines["right"].set_visible(False)
        if split:
            if ymax > 1500:
                ax[a, j].set_ylim(0, 1050)
            else:
                ax[a, j].set_ylim(0, 1500)
        else:
            ax[a, j].set_ylim(0, ymax)

    # Plot RNA at the bottom panel
    ax[a, 0].fill_between(
        np.arange(zoom_ini[0] // 1000, zoom_ini[1] // 1000, rna_binning / 1000),
        rna1[zoom_ini[0] // rna_binning : zoom_ini[1] // rna_binning],
        color="black",
    )
    ax[a, 1].fill_between(
        np.arange(zoom_ini[0] // 1000, zoom_ini[1] // 1000, rna_binning / 1000),
        rna2[zoom_ini[0] // rna_binning : zoom_ini[1] // rna_binning],
        color="black",
    )

    # GC content
    gc_binning = 100
    for j in range(2):
        ax[a + 1, j].plot(
            np.arange(
                zoom_ini[0] // 1000, zoom_ini[1] // 1000, gc_binning / 1000
            ),
            gc_content[
                (zoom_ini[0] + 250)
                // gc_binning : (zoom_ini[1] + 250)
                // gc_binning
            ],
            color="black",
        )
        ax[a + 1, j].set_xlim(zoom_ini[0] // 1000, zoom_ini[1] // 1000)
        ax[a + 1, j].tick_params(axis="both", labelsize=15)
        ax[a + 1, j].get_xaxis().set_visible(False)
    ax[a + 1, 0].set_ylabel("GC", fontsize=15)

    # Coverage
    cov_binning = 1000
    ax[a + 2, 0].plot(
        np.arange(zoom_ini[0] // 1000, zoom_ini[1] // 1000, cov_binning / 1000),
        cov1[
            (zoom_ini[0] + 250)
            // cov_binning : (zoom_ini[1] + 250)
            // cov_binning
        ],
        color="black",
    )
    ax[a + 2, 1].plot(
        np.arange(zoom_ini[0] // 1000, zoom_ini[1] // 1000, cov_binning / 1000),
        cov2[
            (zoom_ini[0] + 250)
            // cov_binning : (zoom_ini[1] + 250)
            // cov_binning
        ],
        color="black",
    )
    for j in range(2):
        ax[a + 2, j].set_xlim(zoom_ini[0] // 1000, zoom_ini[1] // 1000)
        ax[a + 2, j].set_ylim(min_cov, max_cov)
        ax[a + 2, j].set_xlabel("Genomic coordinates (kb)", size=15)
        ax[a + 2, j].tick_params(axis="both", labelsize=15)
    ax[a + 2, 0].set_ylabel("HiC\ncoverage\n(CPM)", fontsize=15)

    # Colorbar
    cbar = fig.colorbar(
        im, ax=ax.ravel().tolist(), shrink=0.3, anchor=(1.2, 0.75)
    )
    cbar.ax.tick_params(labelsize=15)

    # Save the fig and adjust margins
    if split and ymax > 1500:
        plt.subplots_adjust(wspace=0.2, hspace=0.1)
    else:
        plt.subplots_adjust(wspace=0.2, hspace=0.1)
    plt.savefig(outfile, bbox_inches="tight", dpi=200)


for split in [True, False]:
    if split:
        outfile = join(
            outdir,
            f"region_{pos[0]}_{pos[1]}_split.pdf",
        )
    else:
        outfile = join(
            outdir,
            f"region_{pos[0]}_{pos[1]}.pdf",
        )
    position = [int(p) * 1000 for p in pos]
    plot_region(
        mat_wt,
        mat_wt_rot,
        mat_rf,
        mat_rf_rot,
        rna_wt,
        rna_rf,
        cov_wt,
        cov_rf,
        annotation,
        res,
        width,
        position,
        outfile,
        split=split,
    )
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
import os
import bacchus.blob as bcb
import bacchus.hic as bch
import bacchus.io as bcio
import numpy as np
import copy
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
import scipy
import scipy.stats as st
import seaborn as sns
import cooler

mat_file = snakemake.input.mat
rna_file = snakemake.input.rna
cov_file = snakemake.input.cov
gc_file = snakemake.input.gc
frags_file = snakemake.input.frags
epod_file = snakemake.input.EPODs
out_bed = str(snakemake.output.bed)
text_file = str(snakemake.output.text_file)
gc_plot = str(snakemake.output.gc)
cov_plot = str(snakemake.output.cov)
RS_plot = str(snakemake.output.RS)
rna_plot = str(snakemake.output.rna)
dist_bar = str(snakemake.output.dist_bar)
dist_line = str(snakemake.output.dist_line)
venn_plot = str(snakemake.output.venn_plot)

# Create outdir if necessary.
os.makedirs(str(snakemake.params.outdir), exist_ok=True)


def write_stats(text_file, text, writing_type="a"):
    with open(text_file, writing_type) as out:
        out.write(f"{text}\n")


# Import data
mat = cooler.Cooler(f"{mat_file}::/resolutions/500").matrix(balance=True)[:]
mat[np.isnan(mat)] = 0
rna, _ = bcio.extract_big_wig(rna_file, ztransform=False)
rna500, _ = bcio.extract_big_wig(rna_file, binning=500, ztransform=False)
rna500 = np.log10(rna500[:-2] + 1e-10)
cov, _ = bcio.extract_big_wig(cov_file, binning=500)
cov = cov[:-2]
gc_content = pd.read_csv(gc_file, sep="\t", header=None).iloc[:, 1]
frags = pd.read_csv(frags_file, sep="\t")
epod_data = pd.read_csv(epod_file, sep="\t", header=None)
mask = bch.mask_white_line(mat)
mat[mask] = np.nan
mat[:, mask] = np.nan

# Save TIDs positions.
blobs, blob_score = bcb.find_blobs(
    mat, size=5, n_mads=10, refine=1 / 3, rna=rna
)
with open(out_bed, "w") as out:
    for blob in blobs:
        out.write(f"blob\t{blob.start * 500}\t{blob.end * 500}\n")

# Save stats on TIDs.
write_stats(
    text_file, f"Numbers of blobs without interpolation: {len(blobs)}", "w"
)
write_stats(
    text_file,
    f"Cumulative size of the blobs without interpolation: {np.sum([x.size for x in blobs]) * 0.5}kb",
)

# Mask bins within a TIDs or not.
blob_mask = np.repeat(False, len(mat))
for i in blobs:
    for j in range(i.start, i.end):
        blob_mask[j] = True
blob_mask_r = np.repeat(False, len(mat))
for i, v in enumerate(blob_mask):
    if v:
        blob_mask_r[i] = False
    else:
        blob_mask_r[i] = True

# GC content
gc_content = np.array(gc_content[np.arange(0, len(gc_content), 5)])
gc_blob = gc_content[blob_mask]
gc_other = gc_content[blob_mask_r]
data = {"GC": gc_content, "blobs": blob_mask}
data = pd.DataFrame(data)
sns.violinplot(x="blobs", y="GC", data=data, palette="tab10")
plt.xlabel("Blobs", size=14)
plt.ylabel("GC content %", size=14)
plt.title("GC content distribution in blobs", size=16)
plt.xticks(size=12)
plt.yticks(size=12)
plt.savefig(gc_plot, bbox_inches="tight")
plt.close()

write_stats(text_file, f"Blob GC content: {np.mean(gc_blob)}")
write_stats(text_file, f"Other GC content: {np.mean(gc_other)}")
write_stats(text_file, f"Global GC content: {np.mean(gc_content)}")
write_stats(text_file, f"T-test blob-other: {st.ttest_ind(gc_blob, gc_other)}")
write_stats(text_file, f"T-test blob-all: {st.ttest_ind(gc_blob, gc_content)}")

# Restriction sites
start_pos = frags.start_pos[1:]
frags = np.zeros(len(gc_content))
for i in start_pos:
    frags[i // 500] += 1
data["frags"] = frags

sns.boxplot(x="blobs", y="frags", data=data, palette="tab10")
plt.xlabel("Blobs", size=14)
plt.ylabel("Restriction site", size=14)
plt.title("Restriction sites distribution in blobs", size=16)
plt.xticks(size=12)
plt.yticks(size=12)
plt.savefig(RS_plot, bbox_inches="tight")
plt.close()

write_stats(text_file, f"Blob RS: {np.mean(frags[blob_mask])}")
write_stats(text_file, f"Other RS: {np.mean(frags[blob_mask_r])}")
write_stats(text_file, f"Global RS: {np.mean(frags)}")
write_stats(
    text_file,
    f"T-test blob-other: {st.ttest_ind(frags[blob_mask], frags[blob_mask_r])}",
)
write_stats(
    text_file, f"T-test blob-all: {st.ttest_ind(frags[blob_mask], frags)}"
)

# Coverage
data["cov"] = cov

sns.violinplot(x="blobs", y="cov", data=data, palette="tab10")
plt.xlabel("Blobs", size=14)
plt.ylabel("HiC Coverage (CPM)", size=14)
plt.title("Coverage distribution in blobs", size=16)
plt.xticks(size=12)
plt.yticks(size=12)
plt.savefig(cov_plot, bbox_inches="tight")
plt.close()

write_stats(text_file, f"Blob coverage: {np.mean(cov[blob_mask])}")
write_stats(text_file, f"Other coverage: {np.mean(cov[blob_mask_r])}")
write_stats(text_file, f"Global coverage: {np.mean(cov)}")
write_stats(
    text_file,
    f"T-test blob-other: {st.ttest_ind(cov[blob_mask], cov[blob_mask_r])}",
)
write_stats(text_file, f"T-test blob-all: {st.ttest_ind(cov[blob_mask], cov)}")

# Transcription
data["rna"] = rna500

sns.violinplot(x="blobs", y="rna", data=data, palette="tab10")
plt.xlabel("Blobs", size=14)
plt.ylabel("Transcription (CPM)", size=14)
plt.title("Transcription distribution in blobs", size=16)
plt.xticks(size=12)
plt.yticks(size=12)
plt.ylim(-8, 6)
plt.savefig(rna_plot, bbox_inches="tight")
plt.close()

write_stats(text_file, f"Blob transcription: {np.mean(rna500[blob_mask])}")
write_stats(text_file, f"Other transcription: {np.mean(rna500[blob_mask_r])}")
write_stats(text_file, f"Global transcription: {np.mean(rna500)}")
write_stats(
    text_file,
    f"T-test blob-other: {st.ttest_ind(rna500[blob_mask], rna500[blob_mask_r])}",
)
write_stats(
    text_file, f"T-test blob-all: {st.ttest_ind(rna500[blob_mask], rna500)}"
)

# TIDs distribution

tid_data = pd.read_csv(out_bed, sep="\t", header=None)
ori = (3925744 + 3925975) // 2
ter = 1590754
macrodomain = [x * 5000 for x in [45, 231, 393, 546, 685, 842]]
x = np.arange(0, 4_641_652, 500)
y = np.zeros(len(x))
for i in tid_data.index:
    start = tid_data.loc[i, 1]
    end = tid_data.loc[i, 2]
    for k in np.arange(start, end, 500):
        y[k // 500] = 1

# Bar plot
fig, ax = plt.subplots(figsize=(20, 5))
ax.fill_between(x / 1_000_000, y, color="k", alpha=1)
plt.xticks(size=16)
plt.xlabel("Genomics coordinates (Mb)", size=18)
ax.yaxis.set_visible(False)
ax.set_title("TIDs distribution", size=20)
for i in macrodomain[:-1]:
    plt.axvline(
        x=i / 1_000_000, c="green", linestyle="dashed", alpha=0.5, linewidth=3
    )
plt.axvline(
    x=macrodomain[-1] / 1_000_000,
    c="green",
    linestyle="dashed",
    alpha=0.5,
    label="macrodomain",
    linewidth=3,
)
plt.axvline(
    x=ori / 1_000_000,
    c="r",
    linestyle="dashed",
    alpha=0.5,
    label="ori",
    linewidth=3,
)
plt.axvline(
    x=ter / 1_000_000,
    c="blue",
    linestyle="dashed",
    alpha=0.5,
    label="ter",
    linewidth=3,
)
plt.legend()
plt.savefig(dist_bar)
plt.close()

# Line plot
bins = np.arange(0, 4_641_652, 50_000)
hist, x_bins = np.histogram(x, bins, weights=y)
fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(x_bins[:-1] / 1_000_000, hist * 1, color="k")
ax.set_xlabel("Genomic coordinates (Mb)", size=16)
ax.set_ylabel("Ratio of TID in a 10kb bin (%)", size=16)
ax.tick_params(labelsize=16)
ax.set_title("TIDs distribution", size=20)
ax.set_xlim(0, 4_641_652 / 1_000_000)
for i in macrodomain[:-1]:
    plt.axvline(x=i / 1_000_000, c="k", linestyle="dashed", alpha=0.5)
plt.axvline(
    x=macrodomain[-1] / 1_000_000,
    c="k",
    linestyle="dashed",
    alpha=0.5,
    label="macrodomain",
)
plt.axvline(
    x=ori / 1_000_000, c="r", linestyle="dashed", alpha=0.5, label="ori"
)
plt.axvline(
    x=ter / 1_000_000, c="blue", linestyle="dashed", alpha=0.5, label="ter"
)
plt.legend()
plt.savefig(dist_line)
plt.close()

# Venn Diagramm

x = np.arange(0, 4_641_652, 5)
y_tid = np.zeros(len(x))
y_epod = np.zeros(len(x))
for i in tid_data.index:
    start = tid_data.loc[i, 1]
    end = tid_data.loc[i, 2]
    for k in np.arange(start, end, 5):
        y_tid[k // 5] = 1
for i in epod_data.index:
    start = epod_data.loc[i, 3]
    end = epod_data.loc[i, 4]
    for k in np.arange(start, end, 5):
        y_epod[k // 5] = 1
epod_only = 0
tid_only = 0
both = 0
none = 0
for i in x:
    k = i // 5
    if (y_epod[k] == 1) & (y_tid[k] == 1):
        both += 5
    elif (y_epod[k] == 0) & (y_tid[k] == 1):
        tid_only += 5
    elif (y_epod[k] == 1) & (y_tid[k] == 0):
        epod_only += 5
    elif (y_epod[k] == 0) & (y_tid[k] == 0):
        none += 5

venn2(
    subsets=(epod_only // 1000, tid_only // 1000, both // 1000),
    set_labels=("EPODs", "TIDs"),
)
plt.savefig(venn_plot)
plt.close()

MAGqual is a command line tool to evaluate the quality of metagenomic bins and generate recommended metadata in line with the MIMAG standards (v0.1.1-beta)

 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
import pandas as pd
import glob
import argparse
import os
from shutil import copyfile
import seaborn as sns
import matplotlib.pyplot as plt

def qual_cluster(comp, cont):
    if (comp >90) and (cont<5):
        qual = "High Quality"
    elif (comp >= 50) and (cont <10):
        qual = "Medium Quality"
    elif (comp <50) and (cont <10):
        qual = "Low Quality"
    else:
        qual = "Failed"
    return qual

def gen_qual(comp, cont):
    if (comp - (cont*5)) >= 50:
        genome = "y"
    else:
        genome = "n"
    return genome

def bsearch(bakta_loc, cluster): 
    length = 0
    loc = str(bakta_loc + str(cluster) + '/' + str(cluster) + '.txt') #don't copy this bit change it !
    # loc = str(str(cluster) + '.txt') #don't copy this bit change it !
    for name in glob.glob(loc):
        bakta_file = name
        with open(bakta_file, 'r') as bakta_log:
            for line in bakta_log:
                if "Length:" in line:
                    length = int(line.split(":")[1])
                if "Count:" in line: 
                    count = int(line.split(":")[1])
                if "N50:" in line:
                    N50 = int(line.split(":")[1])
                if "Software:" in line: 
                    software = line.split(":")[1].strip()
                if "Database" in line:
                    db = line.split(":")[1].strip()
    bakta_v = "Bakta " + software + " DB " + db 
    row_dict = {"length" : length, "contigs" : count, "N50": N50, "bakta_v" : bakta_v}
    return pd.Series(row_dict)

parser = argparse.ArgumentParser(description='')
parser.add_argument('output', help='output directory for the organised bins', type=str)
parser.add_argument('checkm_log', help='checkm output log file (TAB SEPARATED', type=str)
parser.add_argument('bakta_loc', help='directory containing all bakta output for all clusters', type=str)
parser.add_argument('seqkit_log', help='file containing the seqkit output for all clusters', type=str)
parser.add_argument('bin_loc', help='directory containing fasta files for all clusters', type=str)
parser.add_argument('jobid', help='prefix for current jobs', type=str)

args = parser.parse_args()
output = args.output
checkm_log = args.checkm_log
bakta_loc = args.bakta_loc
seqkit_log = args.seqkit_log
bin_loc = args.bin_loc
job_id = args.jobid

comp_software = "CheckM v.1.0.13"
comp_approach = "Marker gene"

colour_dict = dict({'High Quality':'#50C5B7',
                  'Near Complete':'#9CEC5B',
                  'Medium Quality': '#F0F465',
                  'Low Quality': "#F8333C",
                  'Failed': '#646E78'})

# =============================================================================
# CHECKM PARSE
# =============================================================================
checkm_df = pd.read_csv(checkm_log, sep = "\t")
checkm_df['Quality'] = checkm_df.apply(lambda x: qual_cluster(x['Completeness'], x['Contamination']), axis=1)
checkm_df[['Size_bp', 'No_contigs', 'N50_length', '16S_Software']] = checkm_df.apply(lambda x: bsearch(bakta_loc, x["Bin Id"]), axis = 1)

checkm_df = checkm_df.set_index("Bin Id")

high_clusters = checkm_df[checkm_df['Quality'].str.contains("High Quality")].index.values.tolist()
med_qual_clusters = checkm_df[checkm_df['Quality'].str.contains("Medium Quality")].index.values.tolist()
low_qual_clusters = checkm_df[checkm_df['Quality'].str.contains("Low Quality")].index.values.tolist()
NA = checkm_df[checkm_df['Quality'].str.contains("Failed")].index.values.tolist()
all_clusters = high_clusters + med_qual_clusters + low_qual_clusters + NA

checkm_df = checkm_df.drop(checkm_df.columns[[1, 2, 3, 4, 5, 6, 7, 8, 9]], axis=1)

# =============================================================================
# SEQKIT PARSE
# =============================================================================
seqkit_df = pd.read_csv(seqkit_log, sep = "\t")
seqkit_df = seqkit_df[["file", "max_len"]]
seqkit_df["file"] = seqkit_df["file"].str.replace('.fasta','', regex=True)
seqkit_df["file"] = seqkit_df["file"].str.split("/").str[-1]
seqkit_df.set_index("file", inplace=True)
seqkit_df.rename(columns={"max_len":"Max_contig_length"}, inplace = True)
checkm_df = pd.merge(checkm_df, seqkit_df, left_index=True, right_index=True, how="left")

# =============================================================================
# BAKTA PARSE
# =============================================================================

high_qual_clusters= []
near_comp_clusters = []
r16s_comp_clusters = []
trna_num = {}
for cluster in all_clusters:
    loc = str(bakta_loc + cluster + '/' + cluster + '.tsv') #change this too 
    # loc = str(str(cluster) + '.tsv') #don't copy this bit change it !
    for name in glob.glob(loc):
        bakta_file = name
        with open(bakta_file, 'r') as bakta_in:
            trna_set = set()
            rna_set = set()
            rrna_16S = "N"
            for line in bakta_in:
                if "tRNA-Ala" in line:
                    trna_set.add("tRNA-Ala")
                if "tRNA-Arg" in line:
                    trna_set.add("tRNA-Arg")
                if "tRNA-Asn" in line:
                    trna_set.add("tRNA-Asn")
                if "tRNA-Asp" in line:
                    trna_set.add("tRNA-Asp")
                if "tRNA-Cys" in line:
                    trna_set.add("tRNA-Cys")
                if "tRNA-Gln" in line:
                    trna_set.add("tRNA-Gln")
                if "tRNA-Glu" in line:
                    trna_set.add("tRNA-Glu")
                if "tRNA-Gly" in line:
                    trna_set.add("tRNA-Gly")
                if "tRNA-His" in line:
                    trna_set.add("tRNA-His")
                if "tRNA-Ile" in line:
                    trna_set.add("tRNA-Ile")
                if "tRNA-Leu" in line:
                    trna_set.add("tRNA-Leu")
                if "tRNA-Lys" in line:
                    trna_set.add("tRNA-Lys")
                if "tRNA-Met" in line:
                    trna_set.add("tRNA-Met")
                if "tRNA-Phe" in line:
                    trna_set.add("tRNA-Phe")
                if "tRNA-Pro" in line:
                    trna_set.add("tRNA-Pro")
                if "tRNA-Ser" in line:
                    trna_set.add("tRNA-Ser")
                if "tRNA-Thr" in line:
                    trna_set.add("tRNA-Thr")
                if "tRNA-Trp" in line:
                    trna_set.add("tRNA-Trp")
                if "tRNA-Tyr" in line:
                    trna_set.add("tRNA-Tyr")
                if "tRNA-Val" in line:
                    trna_set.add("tRNA-Val")
                if ("5S ribosomal RNA" in line) and ("partial" not in line):
                    rna_set.add('5S')
                if ("16S ribosomal RNA" in line) and ("partial" not in line):
                    rna_set.add('16S')
                    rrna_16S = "Y"
                if ("23S ribosomal RNA" in line) and ("partial" not in line):
                    rna_set.add('23s')
        if rrna_16S == "Y":
            r16s_comp_clusters.append(cluster)
        if cluster in high_clusters:
            if (len(trna_set) >= 18) and (len(rna_set) == 3):
                high_qual_clusters.append(cluster)
            else:
                near_comp_clusters.append(cluster) # adds high qual that fail trna/rna
        trna_num.update({cluster : len(trna_set)})


# =============================================================================
# Add these new qualities to the checkm dataframe        
# =============================================================================

checkm_df.loc[checkm_df.index.isin(near_comp_clusters), "Quality"] = "Near Complete"

checkm_df.loc[checkm_df.index.isin(r16s_comp_clusters), "16S_Recovered"] = "Y"
checkm_df["16S_Recovered"] = checkm_df["16S_Recovered"].fillna("N")

# Sort out legend order
label_order = ["High Quality", "Near Complete", "Medium Quality", "Low Quality", "Failed"]
labels_all = checkm_df["Quality"].unique().tolist()

for item in label_order:
    if item not in labels_all:
        label_order.remove(item)

labels_list = label_order

# =============================================================================
# Basic plot
# =============================================================================

checkm_df["Purity"] = 100 - checkm_df["Contamination"]
plt.figure(figsize=(15, 10))
ax = sns.scatterplot(data = checkm_df, x="Completeness", y="Purity",
                hue='Quality', size = 'Size_bp', sizes=(20, 800), alpha = 0.6, 
                palette=colour_dict, hue_order= labels_list)
sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))

plt.xlabel('Completeness (%)', size = 24)
plt.ylabel('Purity (%)', size = 24)
plt.xlim(0)
plt.legend(prop={'size': 20})
plt.tight_layout()
plt.tick_params(labelsize=18)

plt.savefig(job_id + '_mag_qual.png')
# =============================================================================
# COPYING FILES INTO QUAL DIRECTORIES
# =============================================================================

location = bin_loc
new_loc = output + "/" + job_id + "/"
os.makedirs(new_loc + "high_qual", exist_ok=True)
os.makedirs(new_loc + "near_comp", exist_ok=True)
os.makedirs(new_loc + "med_qual", exist_ok=True)
os.makedirs(new_loc + "low_qual", exist_ok=True)
os.makedirs(new_loc + "failed", exist_ok=True)

for high in high_qual_clusters:
    file = location + high + ".fasta"
    copyfile(file, new_loc +"high_qual/"+high+".fasta")

for nc in near_comp_clusters:
    file = location + nc + ".fasta"
    copyfile(file, new_loc +"near_comp/"+nc+".fasta")

for med in med_qual_clusters:
    file = location + med + ".fasta"
    copyfile(file, new_loc+"med_qual/"+med+".fasta")

for low in low_qual_clusters:
    file = location + low + ".fasta"
    copyfile(file, new_loc+"low_qual/"+low+".fasta")

for NA_bin in NA:
    file = location + NA_bin + ".fasta"
    copyfile(file, new_loc+"failed/"+NA_bin+".fasta")

# =============================================================================
# OUTPUT CREATED HERE
# =============================================================================

magqual_df = checkm_df[["Quality", "Completeness", "Contamination", "16S_Recovered", "16S_Software", "Size_bp", "No_contigs", "N50_length", "Max_contig_length"]].copy()
magqual_df["tRNA_Extracted"] = pd.Series(trna_num)
magqual_df["tRNA_Software"] = magqual_df["16S_Software"]
magqual_df["Completeness_Approach"] = comp_approach
magqual_df["Completeness_Software"] = comp_software

magqual_df = magqual_df.reindex(columns=["Quality", "Completeness", "Contamination", "Completeness_Software","Completeness_Approach", "16S_Recovered", "16S_Software", "tRNA_Extracted", "tRNA_Software", "Size_bp", "No_contigs", "N50_length", "Max_contig_length"])

magqual_df.to_csv("analysis/" + job_id + "_mag_qual_statistics.csv")

print("-" * 12)
print(" NUMBER MAGs")
print("-" * 12)
print("High Quality:", len(high_qual_clusters))
print("Near Complete:", len(near_comp_clusters))
print("Med Quality:", len(med_qual_clusters))
print("Low Quality:", len(low_qual_clusters))
print("Failed:", len(NA), "\n")
print("-" * 12)
print(" MAG IDs")
print("-" * 12)
print("High Quality: " , ", ".join([str(x) for x in high_qual_clusters]))
print("Near Complete: ", ", ".join([str(x) for x in near_comp_clusters]))
print("Med Quality: ", ", ".join([str(x) for x in med_qual_clusters]))
print("Low Quality: ", ", ".join([str(x) for x in low_qual_clusters]))
print("Failed: ", ", ".join([str(x) for x in NA]))

Snakemake pipelines for replicating sstar analysis

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib
matplotlib.use("Agg")

import seaborn as sns
sns.set_style("darkgrid")

from sklearn import metrics


sstar_1src_accuracy = pd.read_csv(snakemake.input.sstar_1src_accuracy, sep="\t").dropna()
sprime_1src_accuracy = pd.read_csv(snakemake.input.sprime_1src_accuracy, sep="\t").dropna()
skovhmm_1src_accuracy = pd.read_csv(snakemake.input.skovhmm_1src_accuracy, sep="\t").dropna()

sstar_1src_accuracy_grouped = sstar_1src_accuracy.groupby(['demography', 'scenario', 'sample', 'cutoff'], as_index=False)
sprime_1src_accuracy_grouped = sprime_1src_accuracy.groupby(['demography', 'sample', 'cutoff'], as_index=False)
skovhmm_1src_accuracy_grouped = skovhmm_1src_accuracy.groupby(['demography', 'sample', 'cutoff'], as_index=False)

sstar_1src_accuracy_mean = sstar_1src_accuracy_grouped.mean()
sprime_1src_accuracy_mean = sprime_1src_accuracy_grouped.mean()
skovhmm_1src_accuracy_mean = skovhmm_1src_accuracy_grouped.mean()

sstar_1src_accuracy_mean.to_csv(snakemake.output.sstar_1src_accuracy_mean, sep="\t", index=False)
sprime_1src_accuracy_mean.to_csv(snakemake.output.sprime_1src_accuracy_mean, sep="\t", index=False)
skovhmm_1src_accuracy_mean.to_csv(snakemake.output.skovhmm_1src_accuracy_mean, sep="\t", index=False)

sprime_1src_accuracy_mean['scenario'] = ['true'] * len(sprime_1src_accuracy_mean)
skovhmm_1src_accuracy_mean['scenario'] = ['true'] * len(skovhmm_1src_accuracy_mean)

sstar_2src_accuracy = pd.read_csv(snakemake.input.sstar_2src_accuracy, sep="\t").dropna()
sprime_2src_accuracy = pd.read_csv(snakemake.input.sprime_2src_accuracy, sep="\t").dropna()
archaicseeker2_2src_accuracy = pd.read_csv(snakemake.input.archaicseeker2_2src_accuracy, sep="\t").dropna()

sstar_2src_accuracy_grouped = sstar_2src_accuracy.groupby(['demography', 'sample', 'cutoff', 'src'], as_index=False)
sprime_2src_accuracy_grouped = sprime_2src_accuracy.groupby(['demography', 'sample', 'cutoff', 'src'], as_index=False)
archaicseeker2_2src_accuracy_grouped = archaicseeker2_2src_accuracy.groupby(['demography', 'sample', 'cutoff', 'src'], as_index=False)

sstar_2src_accuracy_mean = sstar_2src_accuracy_grouped.mean()
sprime_2src_accuracy_mean = sprime_2src_accuracy_grouped.mean()
archaicseeker2_2src_accuracy_mean = archaicseeker2_2src_accuracy_grouped.mean()

sstar_2src_accuracy_mean.to_csv(snakemake.output.sstar_2src_accuracy_mean, sep="\t", index=False)
sprime_2src_accuracy_mean.to_csv(snakemake.output.sprime_2src_accuracy_mean, sep="\t", index=False)
archaicseeker2_2src_accuracy_mean.to_csv(snakemake.output.archaicseeker2_2src_accuracy_mean, sep="\t", index=False)

methods1 = ['sstar', 'sprime', 'skovhmm']
demography1 = ['HumanNeanderthal', 'BonoboGhost']
samples = ['nref_10_ntgt_1', 'nref_50_ntgt_1']
scenarios = ['true', 'const', 'ref_tgt_only']
accuracy1 = {
    'sstar': sstar_1src_accuracy_mean,
    'sprime': sprime_1src_accuracy_mean,
    'skovhmm': skovhmm_1src_accuracy_mean,
}
methods2 = [
    'sstar', 
    'sprime', 
    'archaicseeker2'
]
demography2 = ['HumanNeanderthalDenisovan', 'ChimpBonoboGhost']
accuracy2 = {
    'sstar': sstar_2src_accuracy_mean,
    'sprime': sprime_2src_accuracy_mean,
    'archaicseeker2': archaicseeker2_2src_accuracy_mean,
}

fig, axs = plt.subplots(nrows=2, ncols=3, constrained_layout=True, figsize=(7.5,4), dpi=350)
gridspec = axs[0, 0].get_subplotspec().get_gridspec()
for a in axs[:,2]:
    a.remove()

markers = {
    'nref_10_ntgt_1': {'symbol':'.', 'size': 6},
    'nref_50_ntgt_1': {'symbol':'*', 'size': 6},
}

colors = {
    'sstar': {'true': 'blue', 'const': 'cyan', 'ref_tgt_only': 'purple'},
    'skovhmm': 'green',
    'sprime': 'orange',
    'archaicseeker2': 'magenta',
}

linestyles = {
    'const': 'dotted',
    'true': 'solid',
    'ref_tgt_only': (0, (3, 1, 1, 1, 1, 1)),
}

titles = {
    'HumanNeanderthal': 'Human-Neanderthal model',
    'BonoboGhost': 'Bonobo-Ghost model',
    'HumanNeanderthalDenisovan': 'Human-Neanderthal-Denisovan model',
    'ChimpBonoboGhost': 'Chimpanzee-Ghost-Bonobo model',
}

zorders = {
    'sstar': 2,
    'skovhmm': 5,
    'sprime': 10,
}

j = 0
for d in demography1:
    for s in samples:
        for sc in scenarios:
            for m in methods1:
                if m == 'sstar': color = colors[m][sc]
                else: color = colors[m]
                df = accuracy1[m][
                        (accuracy1[m]['demography'] == d) &
                        (accuracy1[m]['sample'] == s) &
                        (accuracy1[m]['scenario'] == sc)
                    ].sort_values(by='recall', ascending=False)
                recall = df['recall']
                precision = df['precision']
                if (m == 'sprime') or (m == 'skovhmm'):
                    if sc != 'true': continue

                if d == 'BonoboGhost':
                    axs[0,j].plot(recall, precision,
                        marker=markers[s]['symbol'], ms=markers[s]['size'],
                        c=color, zorder=zorders[m])
                else:
                    axs[0,j].plot(recall, precision,
                        marker=markers[s]['symbol'], ms=markers[s]['size'],
                        c=color)

    axs[0,j].set_xlabel('Recall (%)', fontsize=10)
    axs[0,j].set_ylabel('Precision (%)', fontsize=10)
    axs[0,j].set_xlim([-5, 105])
    axs[0,j].set_ylim([-5, 105])
    axs[0,j].set_title(titles[d], fontsize=8, weight='bold')
    if j == 0: 
        axs[0,j].text(-35, 110, 'B', fontsize=10, weight='bold')
        axs[0,j].plot([0,100],[2.25,2.25], c='red', alpha=0.5)
    if j == 1: 
        axs[0,j].text(-35, 110, 'C', fontsize=10, weight='bold')
        axs[0,j].plot([0,100],[2,2], c='red', alpha=0.5)

    f_scores = np.linspace(20, 80, num=4)
    lines, labels = [], []
    for f_score in f_scores:
        x = np.linspace(1, 100)
        y = f_score * x / (2 * x - f_score)
        (l,) = axs[0,j].plot(x[y >= 0], y[y >= 0], color="black", alpha=0.4, linestyle='dotted', zorder=1)
        axs[0,j].annotate("F1={0:0.0f}%".format(f_score), xy=(101, y[45] + 2), fontsize=8)

    j += 1

j = 0
for d in demography2:
    for s in samples:
        for m in methods2:
            if m == 'sstar': color = colors[m]['true']
            else: color = colors[m]
            src1_df = accuracy2[m][
                         (accuracy2[m]['demography'] == d) &
                         (accuracy2[m]['sample'] == s) &
                         (accuracy2[m]['src'] == 'src1')
                     ].sort_values(by='recall', ascending=False)
            src2_df = accuracy2[m][
                         (accuracy2[m]['demography'] == d) &
                         (accuracy2[m]['sample'] == s) &
                         (accuracy2[m]['src'] == 'src2')
                     ].sort_values(by='recall', ascending=False)
            src1_recall = src1_df['recall']
            src1_precision = src1_df['precision']
            src2_recall = src2_df['recall']
            src2_precision = src2_df['precision']
            axs[1,j].plot(src1_recall, src1_precision,
                     marker=markers[s]['symbol'], ms=markers[s]['size'],
                     c=color, markerfacecolor='white')
            axs[1,j].plot(src2_recall, src2_precision,
                     marker=markers[s]['symbol'], ms=markers[s]['size'],
                     c=color, linestyle='dashdot')

    axs[1,j].set_xlabel('Recall (%)', fontsize=10)
    axs[1,j].set_ylabel('Precision (%)', fontsize=10)
    axs[1,j].set_xlim([-5, 105])
    axs[1,j].set_ylim([-5, 105])
    axs[1,j].set_title(titles[d], fontsize=8, weight='bold')
    if j == 0: 
        axs[1,j].text(-35, 110, 'D', fontsize=10, weight='bold')
        axs[1,j].plot([0,100],[0.2,0.2], c='red', alpha=0.5)
        axs[1,j].plot([0,100],[4,4], c='red', linestyle='dotted')
    if j == 1: 
        axs[1,j].text(-35, 110, 'E', fontsize=10, weight='bold')
        axs[1,j].plot([0,100],[2,2], c='red', alpha=0.5)
        axs[1,j].plot([0,100],[2,2], c='red', linestyle='dotted')

    f_scores = np.linspace(20, 80, num=4)
    lines, labels = [], []
    for f_score in f_scores:
        x = np.linspace(1, 100)
        y = f_score * x / (2 * x - f_score)
        (l,) = axs[1,j].plot(x[y >= 0], y[y >= 0], color="black", alpha=0.4, linestyle='dotted', zorder=1)
        axs[1,j].annotate("F1={0:0.0f}%".format(f_score), xy=(101, y[45] + 2), fontsize=8)

    j += 1

# legend
subfig = fig.add_subfigure(gridspec[:,2])
handles, labels = subfig.gca().get_legend_handles_labels()
sstar_line = plt.Line2D([0], [0], label='sstar (full)', color=colors['sstar']['true'])
sstar_line2 = plt.Line2D([0], [0], label='sstar (constant)', color=colors['sstar']['const'])
sstar_line3 = plt.Line2D([0], [0], label='sstar (only ref & tgt)', color=colors['sstar']['ref_tgt_only'])
skovhmm_line = plt.Line2D([0], [0], label='SkovHMM', color=colors['skovhmm'])
sprime_line = plt.Line2D([0], [0], label='SPrime', color=colors['sprime'])
archaicseeker2_line = plt.Line2D([0], [0], label='ArchaicSeeker2.0', color=colors['archaicseeker2'])
baseline1 = plt.Line2D([0], [0], label='baseline/src1 baseline', color='red', alpha=0.5)
baseline2 = plt.Line2D([0], [0], label='src2 baseline', color='red', linestyle='dotted')
f1_curves = plt.Line2D([0], [0], label='iso-F1 curves', color='black', alpha=0.4, linestyle='dotted')
nref_10_ntgt_1 = plt.Line2D([0], [0], marker=markers['nref_10_ntgt_1']['symbol'],
                            ms=5, label='Nref = 10', color='black', linewidth=0)
nref_50_ntgt_1 = plt.Line2D([0], [0], marker=markers['nref_50_ntgt_1']['symbol'],
                            ms=5, label='Nref = 50', color='black', linewidth=0)
src1 = plt.Line2D([0], [0], label='src1', color='black', marker='o', ms=4, markerfacecolor='white')
src2 = plt.Line2D([0], [0], label='src2', color='black', marker='o', ms=4, linestyle='dotted')

handles.extend([sstar_line, sstar_line2, sstar_line3, skovhmm_line, sprime_line, archaicseeker2_line,
                baseline1, baseline2, f1_curves, nref_10_ntgt_1, nref_50_ntgt_1, src1, src2])
subfig.legend(handles=handles, fontsize=8, handlelength=1.5)

fig.set_constrained_layout_pads(w_pad=4 / 72, h_pad=4 / 72, hspace=0, wspace=0.1)
plt.savefig(snakemake.output.accuracy, bbox_inches='tight')

Snakemake workflow: dna-seq-gatk-variant-calling

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import common
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

calls = pd.read_table(snakemake.input[0], header=[0, 1])
samples = [name for name in calls.columns.levels[0] if name != "VARIANT"]
sample_info = calls.loc[:, samples].stack([0, 1]).unstack().reset_index(1, drop=False)
sample_info = sample_info.rename({"level_1": "sample"}, axis=1)

sample_info = sample_info[sample_info["DP"] > 0]
sample_info["freq"] = sample_info["AD"] / sample_info["DP"]
sample_info.index = np.arange(sample_info.shape[0])

plt.figure()

sns.stripplot(x="sample", y="freq", data=sample_info, jitter=True)
plt.ylabel("allele frequency")
plt.xticks(rotation="vertical")

plt.savefig(snakemake.output.freqs)

plt.figure()

sns.stripplot(x="sample", y="DP", data=sample_info, jitter=True)
plt.ylabel("read depth")
plt.xticks(rotation="vertical")

plt.savefig(snakemake.output.depths)
tool / pypi

seaborn

-------------------------------------- seaborn: statistical data visualization ======================================= [![PyPI Version](https://img.shields.io/pypi/v/seaborn.svg)](https://pypi.org/project/seaborn/) [![License](https://img.shields.io/pypi/l/seaborn.svg)](https://github.com/mwaskom/seaborn/blob/master/LICENSE) [![DOI](https://joss.theoj.org/papers/10.21105/joss.03021/status.svg)](https://doi.org/10.21105/joss.03021) [![Tests](https://github.com/mwaskom/seaborn/workflows/CI/badge.svg)](https://github.com/mwaskom/seaborn/actions) [![Code Coverage](https://codecov.io/gh/mwaskom/seaborn/branch/master/graph/badge.svg)](https://codecov.io/gh/mwaskom/seaborn) Seaborn is a Python visualization library based on matplotlib. It provides a high-level interface for drawing attractive statistical graphics. Documentation ------------- Online documentation is available at [seaborn.pydata.org](https://seaborn.pydata.org). The docs include a [tutorial](https://seaborn.pydata.org/tutorial.html), [example