Workflow Steps and Code Snippets

249 tagged steps and code snippets that match keyword scipy

In this repository, we present the code for the analysis of study of the transcription's impact on Escherichia coli chromosome.

  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
import bacchus.directional as bcd
import cooler
import matplotlib.pyplot as plt
import numpy as np
import os
import scipy.sparse as sp

mat_file = snakemake.input.mat
res = snakemake.params.res
cmap = snakemake.params.cmap
macro_output = str(snakemake.output.macrodomain)
CID_output = str(snakemake.output.cid)
out_file = str(snakemake.output.compare)

# Create outdir if necessary.
os.makedirs(str(snakemake.params.outdir), exist_ok=True)

mat = cooler.Cooler(f"{mat_file}::/resolutions/{res}").matrix(
    balance=True, sparse=True
)[:]

# Positions of the CIDs borders from Lioy et al., Cell, 2018.
lioy = [
    40,
    145,
    220,
    440,
    510,
    760,
    980,
    1145,
    1195,
    1300,
    1585,
    1670,
    1795,
    1845,
    2100,
    2255,
    2380,
    2525,
    2720,
    2895,
    2990,
    3165,
    3295,
    3430,
    3645,
    3800,
    3935,
    4030,
    4160,
    4205,
    4465,
]

# Length of E coli chromosome in kb
n = 4642

# Comput min max size
max_cid = n - lioy[-1] + lioy[0]
min_cid = n - lioy[-1] + lioy[0]
prev_cid = lioy[0]
for i in lioy[1:]:
    size = i - prev_cid
    prev_cid = i
    if size > max_cid:
        max_cid = size
    if size < min_cid:
        min_cid = size

print(f"Longest CID- Lioy: {max_cid}kb")
print(f"Shortest CID - Lioy: {min_cid}kb")

# Compute macrodomains on our matrix based on directional index.
di_macro = bcd.directional_index(mat, 80)
plt.subplots(figsize=(12, 1))
plt.fill_between(
    x=np.arange(0, len(di_macro) * 5, 5),
    y1=0,
    y2=di_macro,
    where=di_macro > 0,
    color="g",
)
plt.fill_between(
    x=np.arange(0, len(di_macro) * 5, 5),
    y1=0,
    y2=di_macro,
    where=di_macro <= 0,
    color="r",
)
plt.ylim(-2, 2)
plt.xlim(0, n)
plt.savefig(macro_output)

borders_macro = bcd.di_borders(di_macro)
print("Numbers of CIDs:", len(borders_macro))

# Compute CIDs on our matrix based on directional index.
di_CIDs = bcd.directional_index(mat, 20)
plt.subplots(figsize=(12, 2))
plt.fill_between(
    x=np.arange(0, len(di_CIDs) * 5, 5),
    y1=0,
    y2=di_CIDs,
    where=di_CIDs > 0,
    color="g",
)
plt.fill_between(
    x=np.arange(0, len(di_CIDs) * 5, 5),
    y1=0,
    y2=di_CIDs,
    where=di_CIDs <= 0,
    color="r",
)
plt.ylim(-2, 2)
plt.xlim(0, n)
plt.savefig(CID_output)

borders_CIDs = bcd.di_borders(di_CIDs)
print("Numbers of CIDs:", len(borders_CIDs))

from matplotlib.patches import Patch
from matplotlib.lines import Line2D

# Plot the CID on a matrix at 5kb with their rspectives positions marked with
# stars.
axis = "kb"
start = 0
title = "WT E. coli domains - binning 5kb"
dpi = 300
vmax = 99

fig, ax = plt.subplots(
    3,
    1,
    figsize=(11, 13),
    dpi=dpi,
    gridspec_kw={"height_ratios": [10, 1, 1]},
    sharex=True,
)

# Axis values
scaling_factor = res // 1000
end = n // scaling_factor

# Display plots
im = ax[0].imshow(
    mat.toarray()[start:end, start:end],
    cmap=cmap,
    vmin=0,
    vmax=np.percentile(mat.toarray(), vmax),
    extent=(
        start * scaling_factor,
        end * scaling_factor,
        end * scaling_factor,
        start * scaling_factor,
    ),
)

# Legend
ax[2].set_xlabel(f"Genomic coordinates ({axis:s})", fontsize=16)
ax[0].set_ylabel(f"Genomic coordinates ({axis:s})", fontsize=16)
ax[0].tick_params(axis="both", labelsize=16)
ax[1].tick_params(axis="both", labelsize=16)
ax[2].tick_params(axis="both", labelsize=16)
ax[2].tick_params(axis="x", which="major", pad=15)
ax[1].set_ylabel("DI\n(400kb)", fontsize=16)
ax[2].set_ylabel("DI\n(100kb)", fontsize=16)
ax[0].set_title(title, size=18)
ax[1].set_title("Macrodomains", size=16)
ax[2].set_title("CIDs", size=16)

# Colorbar
cbar = plt.colorbar(im, ax=ax.ravel().tolist(), shrink=0.33, anchor=(0, 0.7))
cbar.ax.tick_params(labelsize=16)

# Add DI plots
ax[1].fill_between(
    x=np.arange(0, len(di_macro) * 5, 5),
    y1=0,
    y2=di_macro,
    where=di_macro > 0,
    color="#33a02c",
)
ax[1].fill_between(
    x=np.arange(0, len(di_macro) * 5, 5),
    y1=0,
    y2=di_macro,
    where=di_macro <= 0,
    color="#e31a1c",
)
ax[1].set_ylim(-2, 2)
ax[1].set_xlim(start, end * scaling_factor)
for i in borders_macro:
    if i > start and i < end:
        ax[1].text(x=(i * 5) - 25, y=-3, s="*", color="k", fontweight="bold")

ax[2].fill_between(
    x=np.arange(0, len(di_CIDs) * 5, 5),
    y1=0,
    y2=di_CIDs,
    where=di_CIDs >= 0,
    color="#33a02c",
    interpolate=True,
)
ax[2].fill_between(
    x=np.arange(0, len(di_CIDs) * 5, 5),
    y1=0,
    y2=di_CIDs,
    where=di_CIDs <= 0,
    color="#e31a1c",
    interpolate=True,
)
ax[2].set_ylim(-2, 2)
ax[2].set_xlim(start * scaling_factor, end * scaling_factor)
for i in borders_CIDs:
    if i > start and i < end:
        ax[0].axvline(x=i * 5, linestyle="dashed", color="k")
        ax[2].text(x=(i * 5) - 25, y=-3, s="*", color="k", fontweight="bold")
for i in [133, 176, 394, 713, 872]:
    if i > start and i < end:
        ax[2].text(
            x=(i * 5) - 25, y=-3, s="*", color="#33a02c", fontweight="bold"
        )
for i in [102, 239, 317, 369, 420, 505, 633, 729, 832]:
    if i > start and i < end:
        ax[2].text(
            x=(i * 5) - 25, y=-3, s="*", color="#e31a1c", fontweight="bold"
        )
# Add legend of stars
legend_elements = [
    Line2D(
        [],
        [],
        color="k",
        marker="*",
        linestyle="None",
        markersize=10,
        label="Conserved borders",
    ),
    Line2D(
        [],
        [],
        color="#33a02c",
        marker="*",
        linestyle="None",
        markersize=10,
        label="New borders",
    ),
    Line2D(
        [],
        [],
        color="#e31a1c",
        marker="*",
        linestyle="None",
        markersize=10,
        label="Missing borders",
    ),
]
ax[2].legend(handles=legend_elements, bbox_to_anchor=(1.3, 1.05))

# Savefig
if out_file is not None:
    plt.savefig(out_file, dpi=dpi)
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
import bacchus.directional as bcd
import bacchus.insulation as bci
import bacchus.io as bcio
import cooler
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import matplotlib.collections as mc
import numpy as np
import os
import pandas as pd
import scipy.sparse as sp
import scipy.stats as st
import seaborn as sns
from random import choices

mat_file = snakemake.input.mat
rna_file = snakemake.input.rna
annotation_file = snakemake.input.annotation
cmap = snakemake.params.cmap
dpi = snakemake.params.dpi
text_file = str(snakemake.output.text_file)
full_mat_file = str(snakemake.output.full_mat)
zoom_1_file = str(snakemake.output.zoom1)
zoom_2_file = str(snakemake.output.zoom2)
full_mat_di_file = str(snakemake.output.full_mat_di)
zoom_1_di_file = str(snakemake.output.zoom1_di)
zoom_2_di_file = str(snakemake.output.zoom2_di)
out_files_mat = [full_mat_file, zoom_1_file, zoom_2_file]
out_files_mat_di = [full_mat_di_file, zoom_1_di_file, zoom_2_di_file]
bor_trans = str(snakemake.output.bor_trans)
trans_bor = str(snakemake.output.trans_bor)

# Create outdir if necessary.
os.makedirs(str(snakemake.params.outdir), exist_ok=True)


def write_stats(text_file, text, writing_type="a"):
    with open(text_file, writing_type) as out:
        out.write(f"{text}\n")


# Compute CIDs at 5kb resolution with 100kb window size.
M5000 = cooler.Cooler(f"{mat_file}::/resolutions/{5000}").matrix(
    balance=True, sparse=True
)[:]
di5000 = bcd.directional_index(M5000, 20)
l5000 = bcd.di_borders(di5000)
write_stats(
    text_file,
    f"Numbers of CIDs at 5kb with 100kb window size: {len(l5000)}",
    "w",
)

# Compute CIDs at 2kb resolution with 50kb window size.
M2000 = cooler.Cooler(f"{mat_file}::/resolutions/{2000}").matrix(
    balance=True, sparse=True
)[:]
di2000 = bcd.directional_index(M2000, 25)
l2000 = bcd.di_borders(di2000)
write_stats(
    text_file, f"Numbers of CIDs at 2kb with 50kb window size: {len(l2000)}"
)

# Compute CIDs at 1kb resolution using insulation score.
M1000 = cooler.Cooler(f"{mat_file}::/resolutions/{1000}").matrix(
    balance=True, sparse=True
)[:]
di1000 = bcd.directional_index(M1000, 20)
l1000 = bcd.di_borders(di1000)
write_stats(
    text_file, f"Numbers of CIDs at 1kb with 50kb window size: {len(l1000)}"
)

mat = cooler.Cooler(f"{mat_file}::/resolutions/{1000}").matrix(
    balance=True, sparse=False
)[:]
mat[np.isnan(mat)] = 0
final_borders_1000, lri_1000 = bci.get_insulation_score(
    mat, [10, 15, 20, 25, 30]
)
write_stats(
    text_file,
    f"{len(final_borders_1000)} CIDs have been detected with insulation score at 1kb resolution.",
)


rna, _ = bcio.extract_big_wig(rna_file, ztransform=False)

annotation = pd.DataFrame(
    columns=["type", "start", "end", "strand", "gene_name", "rpkm"]
)

n = 0
with open(annotation_file, "r") as file:
    for line in file:
        if line.startswith("#"):
            continue
        elif line.startswith(">"):
            break
        else:
            line = line.split("\t")
            if line[2] in ["gene", "tRNA", "rRNA"]:
                if line[2] == "gene":
                    name = line[8].split("Name=")[-1].split(";")[0]
                    annot = {
                        "type": line[2],
                        "start": int(line[3]),
                        "end": int(line[4]),
                        "strand": line[6],
                        "gene_name": name,
                        "rpkm": np.mean(rna[int(line[3]) : int(line[4])]),
                    }
                    annotation = annotation.append(annot, ignore_index=True)


###
# Print matrices with CIDs borders (Fig 1e).
###
starts = [0, 2500, 3000]
ends = [len(mat), 3000, 3500]
col = ["blue", "lime", "cyan"]

for i in range(3):
    out_file, start, end = out_files_mat[i], starts[i], ends[i]

    vmax = 99
    title = None
    fig, ax = plt.subplots(1, 1, figsize=(7, 7), dpi=dpi)

    # Axis values
    scaling_factor = 1.0
    axis = "kb"

    # No end values given.
    if end == 0:
        end = len(mat)

    # Display plots
    im = ax.imshow(
        mat[start:end, start:end],
        cmap=cmap,
        vmin=0,
        vmax=np.nanpercentile(mat, vmax),
        extent=(
            start * scaling_factor,
            end * scaling_factor,
            end * scaling_factor,
            start * scaling_factor,
        ),
    )

    # Legend
    ax.set_xlabel(f"Genomic coordinates ({axis:s})", fontsize=16)
    ax.set_ylabel(f"Genomic coordinates ({axis:s})", fontsize=16)
    ax.tick_params(axis="both", labelsize=16)

    # Title
    if title is not None:
        ax.set_title(title, size=18)

    # Colorbar
    cbar = plt.colorbar(im, shrink=0.33, anchor=(0, 0.5))
    cbar.ax.tick_params(labelsize=16)

    first = True
    for i in [x * 5 for x in l5000]:
        if i in np.arange(start, end):
            #         plt.axvline(i, lw=1, ls='dashed', c='blue')
            if first:
                s = i
                first = False
                ax.axvline(
                    s,
                    ymin=(end - s) / (end - start),
                    ymax=1,
                    c=col[0],
                    lw=1,
                    zorder=0.5,
                )
                ax.axhline(
                    s,
                    xmin=0,
                    xmax=1 - ((end - s) / (end - start)),
                    c=col[0],
                    lw=1,
                    zorder=0.5,
                )
            else:
                e = i
                ax.add_patch(
                    patches.Rectangle(
                        (s, s), e - s, e - s, edgecolor=col[0], fill=False
                    )
                )
                s = i
    ax.axvline(
        s, ymin=(end - s) / (end - start), ymax=0, c=col[0], lw=1, zorder=0.5
    )
    ax.axhline(
        s,
        xmin=1,
        xmax=1 - ((end - s) / (end - start)),
        c=col[0],
        lw=1,
        zorder=0.5,
    )

    first = True
    for i in [x * 5 for x in l2000]:
        if i in np.arange(start, end):
            #         plt.axvline(i, lw=1, ls='dashed', c='cyan')
            if first:
                s = i
                first = False
                ax.axvline(
                    s,
                    ymin=(end - s) / (end - start),
                    ymax=1,
                    c=col[1],
                    lw=1,
                    zorder=0.5,
                )
                ax.axhline(
                    s,
                    xmin=0,
                    xmax=1 - ((end - s) / (end - start)),
                    c=col[1],
                    lw=1,
                    zorder=0.5,
                )
            else:
                e = i
                ax.add_patch(
                    patches.Rectangle(
                        (s, s), e - s, e - s, edgecolor=col[1], fill=False
                    )
                )
                s = i
    ax.axvline(
        s, ymin=(end - s) / (end - start), ymax=0, c=col[1], lw=1, zorder=0.5
    )
    ax.axhline(
        s,
        xmin=1,
        xmax=1 - ((end - s) / (end - start)),
        c=col[1],
        lw=1,
        zorder=0.5,
    )

    first = True
    for i in final_borders_1000:
        if i in np.arange(start, end):
            #         plt.axvline(i, lw=1, ls='dashed', c='magenta')
            if first:
                s = i
                first = False
                ax.axvline(
                    s,
                    ymin=(end - s) / (end - start),
                    ymax=1,
                    c=col[2],
                    lw=1,
                    zorder=0.5,
                )
                ax.axhline(
                    s,
                    xmin=0,
                    xmax=1 - ((end - s) / (end - start)),
                    c=col[2],
                    lw=1,
                    zorder=0.5,
                )
            else:
                e = i
                ax.add_patch(
                    patches.Rectangle(
                        (s, s), e - s, e - s, edgecolor=col[2], fill=False
                    )
                )
                s = i
    ax.axvline(
        s, ymin=(end - s) / (end - start), ymax=0, c=col[2], lw=1, zorder=0.5
    )
    ax.axhline(
        s,
        xmin=1,
        xmax=1 - ((end - s) / (end - start)),
        c=col[2],
        lw=1,
        zorder=0.5,
    )

    # Savefig
    plt.savefig(out_file, dpi=dpi, bbox_inches="tight")
    plt.close()


###
# Print full matrices with CIDs borders using DI only (Supp Fig 1f).
###

for i in range(3):
    out_file, start, end = out_files_mat_di[i], starts[i], ends[i]

    vmax = 99
    title = None
    fig, ax = plt.subplots(1, 1, figsize=(7, 7), dpi=dpi)

    # Axis values
    scaling_factor = 1.0
    axis = "kb"

    # No end values given.
    if end == 0:
        end = len(mat)

    # Display plots
    im = ax.imshow(
        mat[start:end, start:end],
        cmap=cmap,
        vmin=0,
        vmax=np.percentile(mat, vmax),
        extent=(
            start * scaling_factor,
            end * scaling_factor,
            end * scaling_factor,
            start * scaling_factor,
        ),
    )

    # Legend
    ax.set_xlabel(f"Genomic coordinates ({axis:s})", fontsize=16)
    ax.set_ylabel(f"Genomic coordinates ({axis:s})", fontsize=16)
    ax.tick_params(axis="both", labelsize=16)

    # Title
    if title is not None:
        ax.set_title(title, size=18)

    # Colorbar
    cbar = plt.colorbar(im, shrink=0.33, anchor=(0, 0.5))
    cbar.ax.tick_params(labelsize=16)

    first = True
    for i in [x * 5 for x in l5000]:
        if i in np.arange(start, end):
            #         plt.axvline(i, lw=1, ls='dashed', c='blue')
            if first:
                s = i
                first = False
                ax.axvline(
                    s,
                    ymin=(end - s) / (end - start),
                    ymax=1,
                    c=col[0],
                    lw=1,
                    zorder=0.5,
                )
                ax.axhline(
                    s,
                    xmin=0,
                    xmax=1 - ((end - s) / (end - start)),
                    c=col[0],
                    lw=1,
                    zorder=0.5,
                )
            else:
                e = i
                ax.add_patch(
                    patches.Rectangle(
                        (s, s), e - s, e - s, edgecolor=col[0], fill=False
                    )
                )
                s = i
    ax.axvline(
        s, ymin=(end - s) / (end - start), ymax=0, c=col[0], lw=1, zorder=0.5
    )
    ax.axhline(
        s,
        xmin=1,
        xmax=1 - ((end - s) / (end - start)),
        c=col[0],
        lw=1,
        zorder=0.5,
    )

    first = True
    for i in [x * 5 for x in l2000]:
        if i in np.arange(start, end):
            # plt.axvline(i, lw=1, ls='dashed', c='cyan')
            if first:
                s = i
                first = False
                ax.axvline(
                    s,
                    ymin=(end - s) / (end - start),
                    ymax=1,
                    c=col[1],
                    lw=1,
                    zorder=0.5,
                )
                ax.axhline(
                    s,
                    xmin=0,
                    xmax=1 - ((end - s) / (end - start)),
                    c=col[1],
                    lw=1,
                    zorder=0.5,
                )
            else:
                e = i
                ax.add_patch(
                    patches.Rectangle(
                        (s, s), e - s, e - s, edgecolor=col[1], fill=False
                    )
                )
                s = i
    ax.axvline(
        s, ymin=(end - s) / (end - start), ymax=0, c=col[1], lw=1, zorder=0.5
    )
    ax.axhline(
        s,
        xmin=1,
        xmax=1 - ((end - s) / (end - start)),
        c=col[1],
        lw=1,
        zorder=0.5,
    )

    first = True
    for i in [x * 5 for x in l1000]:
        if i in np.arange(start, end):
            # plt.axvline(i, lw=1, ls='dashed', c='cyan')
            if first:
                s = i
                first = False
                ax.axvline(
                    s,
                    ymin=(end - s) / (end - start),
                    ymax=1,
                    c=col[2],
                    lw=1,
                    zorder=0.5,
                )
                ax.axhline(
                    s,
                    xmin=0,
                    xmax=1 - ((end - s) / (end - start)),
                    c=col[2],
                    lw=1,
                    zorder=0.5,
                )
            else:
                e = i
                ax.add_patch(
                    patches.Rectangle(
                        (s, s), e - s, e - s, edgecolor=col[2], fill=False
                    )
                )
                s = i
    ax.axvline(
        s, ymin=(end - s) / (end - start), ymax=0, c=col[2], lw=1, zorder=0.5
    )
    ax.axhline(
        s,
        xmin=1,
        xmax=1 - ((end - s) / (end - start)),
        c=col[2],
        lw=1,
        zorder=0.5,
    )

    # Savefig
    plt.savefig(out_file, dpi=dpi, bbox_inches="tight")
    plt.close()

###
# Transcription at borders.
###


def get_borders_transcriptions(annotation, borders, size, step):
    borders_transcriptions = []
    for i in annotation.index:
        if annotation.loc[i, "strand"] == "-":
            pos = annotation.loc[i, "end"]
        else:
            pos = annotation.loc[i, "start"]
        for j in borders:
            a = j * size + step
            if abs(pos - a) < 2500:
                borders_transcriptions.append(annotation.loc[i, "rpkm"])
    return borders_transcriptions


borders_trans_5000 = get_borders_transcriptions(annotation, l5000, 5000, 2500)
borders_trans_2000 = get_borders_transcriptions(annotation, l2000, 5000, 1000)
borders_trans_1000 = get_borders_transcriptions(
    annotation, final_borders_1000, 1000, 500
)
data = {
    "RPKM (log)": np.log(
        list(annotation.rpkm)
        + list(np.log(borders_trans_5000))
        + list(np.log(borders_trans_2000))
        + list(np.log(borders_trans_1000))
    ),
    "Genes": list(np.repeat(f"All genes\nn={len(annotation)}", len(annotation)))
    + list(
        np.repeat(f"5kb\nn={len(borders_trans_5000)}", len(borders_trans_5000))
    )
    + list(
        np.repeat(f"2kb\nn={len(borders_trans_2000)}", len(borders_trans_2000))
    )
    + list(
        np.repeat(f"1kb\nn={len(borders_trans_1000)}", len(borders_trans_1000))
    ),
}
data = pd.DataFrame(data)
data.replace([np.inf, -np.inf, np.nan], 0, inplace=True)

sns.violinplot(x="Genes", y="RPKM (log)", data=data, palette="tab10")
plt.xlabel("Resolution of the matrix to detect the borders", size=14)
plt.ylabel("RPKM (log)", size=14)
plt.title(
    "Genes transcription of genes at less\nthan 5kb of a detected border",
    size=16,
)
plt.xticks(size=12)
plt.yticks(size=12)
plt.savefig(bor_trans, bbox_inches="tight")
plt.close()

write_stats(
    text_file,
    f"p-value between 5kb borders and whole genome: {st.mannwhitneyu(np.log(list(annotation.rpkm)), np.log(borders_trans_5000))[1]}",
)
write_stats(
    text_file,
    f"p-value between 2kb borders and whole genome: {st.mannwhitneyu(np.log(list(annotation.rpkm)), np.log(borders_trans_2000))[1]}",
)
write_stats(
    text_file,
    f"p-value between 1kb borders and whole genome: {st.mannwhitneyu(np.log(list(annotation.rpkm)), np.log(borders_trans_1000))[1]}",
)

##
# Transcription depending on distance borders
###

annotation["dist_5kb"] = 0
annotation["dist_2kb"] = 0
annotation["dist_1kb"] = 0
annotation["log1p_rpkm"] = np.log1p(annotation["rpkm"])
for i in annotation.index:
    value = np.inf
    if annotation.loc[i, "strand"] == "+":
        for x in l5000:
            pos = annotation.loc[i, "start"]
            if pos < x * 5000:
                a = x * 5000 - pos
            elif pos > (x + 1) * 5000:
                a = pos - (x + 1) * 5000
            else:
                a = -1
            value = min(a, value)
    else:
        for x in l5000:
            pos = annotation.loc[i, "end"]
            if pos < x * 5000:
                a = x * 5000 - pos
            elif pos > (x + 1) * 5000:
                a = pos - (x + 1) * 5000
            else:
                a = -1
            value = min(a, value)
    annotation.loc[i, "dist_5kb"] = value

for i in annotation.index:
    value = np.inf
    if annotation.loc[i, "strand"] == "+":
        for x in l2000:
            pos = annotation.loc[i, "start"]
            if pos < x * 5000:
                a = x * 5000 - pos
            elif pos > (x + 0.4) * 5000:
                a = pos - (x + 0.4) * 5000
            else:
                a = -1
            value = min(a, value)
    else:
        for x in l2000:
            pos = annotation.loc[i, "end"]
            if pos < x * 5000:
                a = x * 5000 - pos
            elif pos > (x + 0.4) * 5000:
                a = pos - (x + 0.4) * 5000
            else:
                a = -1
            value = min(a, value)
    annotation.loc[i, "dist_2kb"] = value

for i in annotation.index:
    value = np.inf
    if annotation.loc[i, "strand"] == "+":
        for x in final_borders_1000:
            pos = annotation.loc[i, "start"]
            if pos < x * 1000:
                a = x * 1000 - pos
            elif pos > (x + 1) * 1000:
                a = pos - (x + 1) * 1000
            else:
                a = -1
            value = min(a, value)
    else:
        for x in final_borders_1000:
            pos = annotation.loc[i, "end"]
            if pos < x * 1000:
                a = x * 1000 - pos
            elif pos > (x + 1) * 1000:
                a = pos - (x + 1) * 1000
            else:
                a = -1
            value = min(a, value)
    annotation.loc[i, "dist_1kb"] = value

bin_means_l3, bin_edges_l3, binnumber_l3 = st.binned_statistic(
    annotation.dist_1kb,
    annotation.rpkm,
    statistic="mean",
    bins=41,
    range=(-5000, 200000),
)
bin_means_l2, bin_edges_l2, binnumber_l2 = st.binned_statistic(
    annotation.dist_2kb,
    annotation.rpkm,
    statistic="mean",
    bins=41,
    range=(-5000, 200000),
)
bin_means_l1, bin_edges_l1, binnumber_l1 = st.binned_statistic(
    annotation.dist_5kb,
    annotation.rpkm,
    statistic="mean",
    bins=41,
    range=(-5000, 200000),
)


def bootstrap_down(data):
    l = np.zeros((10000))
    for i in range(10000):
        l[i] = np.mean(choices(data, k=len(data)))
    return np.percentile(l, 5)


def bootstrap_up(data):
    l = np.zeros((10000))
    for i in range(10000):
        l[i] = np.mean(choices(data, k=len(data)))
    return np.percentile(l, 95)


bin_perc5_l1, bin_edges_l1, binnumber_l1 = st.binned_statistic(
    annotation.dist_5kb,
    annotation.rpkm,
    statistic=bootstrap_down,
    bins=41,
    range=(-5000, 200000),
)
bin_perc95_l1, bin_edges_l1, binnumber_l1 = st.binned_statistic(
    annotation.dist_5kb,
    annotation.rpkm,
    statistic=bootstrap_up,
    bins=41,
    range=(-5000, 200000),
)
bin_perc5_l2, bin_edges_l2, binnumber_l2 = st.binned_statistic(
    annotation.dist_2kb,
    annotation.rpkm,
    statistic=bootstrap_down,
    bins=41,
    range=(-5000, 200000),
)
bin_perc95_l2, bin_edges_l2, binnumber_l2 = st.binned_statistic(
    annotation.dist_2kb,
    annotation.rpkm,
    statistic=bootstrap_up,
    bins=41,
    range=(-5000, 200000),
)
bin_perc5_l3, bin_edges_l3, binnumber_l3 = st.binned_statistic(
    annotation.dist_1kb,
    annotation.rpkm,
    statistic=bootstrap_down,
    bins=41,
    range=(-5000, 200000),
)
bin_perc95_l3, bin_edges_l3, binnumber_l3 = st.binned_statistic(
    annotation.dist_1kb,
    annotation.rpkm,
    statistic=bootstrap_up,
    bins=41,
    range=(-5000, 200000),
)

plt.plot(bin_edges_l1[1:] / 1000, bin_means_l1, label="5kb")
plt.fill_between(
    bin_edges_l1[1:] / 1000, bin_perc5_l1, bin_perc95_l1, alpha=0.5
)
plt.plot(bin_edges_l2[1:] / 1000, bin_means_l2, label="2kb")
plt.fill_between(
    bin_edges_l2[1:] / 1000, bin_perc5_l2, bin_perc95_l2, alpha=0.5
)
plt.plot(bin_edges_l3[1:] / 1000, bin_means_l3, label="1kb")
plt.fill_between(
    bin_edges_l3[1:] / 1000, bin_perc5_l3, bin_perc95_l3, alpha=0.5
)
plt.xlabel("Distance from the closest border (kb)", size=14)
plt.ylabel("Mean RPKM", size=14)
plt.xlim(0, 50)
plt.title(
    "Genes transcription depending on\nthe distance to the closest border",
    size=16,
)
plt.xticks(size=12)
plt.yticks(size=12)
plt.legend()
plt.savefig(trans_bor, bbox_inches="tight")
plt.close()
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import bacchus.io as bcio
import bacchus.plot as bcp
import bacchus.hic as bch
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as st
import os
from os.path import dirname

# Make sure that the figures directory exists.
os.makedirs(dirname(snakemake.output.mat_fig), exist_ok=True)
os.makedirs(dirname(snakemake.output.corr_fig), exist_ok=True)

# Import parameters
binning = int(snakemake.params.binning)
label = str(snakemake.params.species)
circular = str(snakemake.params.circular)

# Import contact map as a dense matrix.
M = bcio.build_map(
    matrix_files=[str(snakemake.input.hic)],
    fragment_file="none",
    bin_size=binning,
    mat_format="cool",
    normalize=True,
    subsample=0,
)

# Plot the contact map.
bcp.contact_map(
    M,
    axis="Mb",
    binning=binning,
    cmap="Reds",
    dpi=500,
    out_file=str(snakemake.output.mat_fig),
    title=label,
    vmax=99,
)

# Import RNA tracks.
rna, chrom_start = bcio.extract_big_wig(
    file=str(snakemake.input.rna),
    binning=binning,
    circular=circular,
    sigma=None,
    ztransform=None,
)
# Ugly loop to have the chrom length...
chrom_start_size = {}
for i, name in enumerate(chrom_start):
    if i != 0:
        chrom_start_size[prev_name] = {
            "start": start,
            "length": chrom_start[name] - start,
        }
    prev_name = name
    start = chrom_start[name]
chrom_start_size[prev_name] = {
    "start": start,
    "length": chrom_start[name] - start,
}

# Compute HiC signal.
hic = bch.compute_hic_signal(M, binning=binning, start=5000, stop=10000)
print(st.spearmanr(hic, rna[:-2])[0])

# Plot correlation between HiC signal and RNAseq.
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
ax.scatter(hic, np.log(rna[:-2]))
ax.set_xlabel("HiC signal")
ax.set_ylabel("Transcription (log)")
ax.set_title(label)
ax.text(
    x=np.nanpercentile(hic, 99),
    y=np.nanpercentile(np.log(rna), 1),
    s=f"corr={st.spearmanr(hic, rna[:-2])[0]:.2f}",
)
plt.savefig(snakemake.output.corr_fig, dpi=100)
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
import bacchus.hic as bch
import bacchus.io as bcio
import cooler
import matplotlib.pyplot as plt
import numpy as np
import os
import scipy.stats as st

# Import snakemake.
mat_t7 = snakemake.input.mat_t7
mat_t7_2p100 = snakemake.input.mat_t7_2p100
mat_t7_2p60 = snakemake.input.mat_t7_2p60
mat_t7_2p100_dv = snakemake.input.mat_t7_2p100_dv
mat_t7_2p60_dv = snakemake.input.mat_t7_2p60_dv
mat_t7_2p100_cv = snakemake.input.mat_t7_2p100_cv
rna_t7 = snakemake.input.rna_t7
rna_t7_2p100 = snakemake.input.rna_t7_2p100
rna_t7_2p60 = snakemake.input.rna_t7_2p60
rna_t7_2p100_dv = snakemake.input.rna_t7_2p100_dv
rna_t7_2p60_dv = snakemake.input.rna_t7_2p60_dv
rna_t7_2p100_cv = snakemake.input.rna_t7_2p100_cv
chip_RNA_t7 = snakemake.input.chip_RNA_t7
chip_RNA_t7_2p100 = snakemake.input.chip_RNA_t7_2p100
chip_RNA_t7_2p60 = snakemake.input.chip_RNA_t7_2p60
chip_RNA_t7_2p100_dv = snakemake.input.chip_RNA_t7_2p100_dv
chip_RNA_t7_2p60_dv = snakemake.input.chip_RNA_t7_2p60_dv
chip_RNA_t7_2p100_cv = snakemake.input.chip_RNA_t7_2p100_cv
chip_gapr_t7 = snakemake.input.chip_gapr_t7
chip_gapr_t7_2p100 = snakemake.input.chip_gapr_t7_2p100
chip_gapr_t7_2p60 = snakemake.input.chip_gapr_t7_2p60
chip_gapr_t7_2p100_dv = snakemake.input.chip_gapr_t7_2p100_dv
chip_gapr_t7_2p60_dv = snakemake.input.chip_gapr_t7_2p60_dv
chip_gapr_t7_2p100_cv = snakemake.input.chip_gapr_t7_2p100_cv
chip_gapr_control = snakemake.input.chip_gapr_control
cmap = snakemake.params.cmap
res = snakemake.params.res
tracks_res = snakemake.params.tracks_res 
outdir = snakemake.params.outdir 
out_plot = str(snakemake.output.plot)
out_txt = str(snakemake.output.txt)

# Make sure output directory exists.
os.makedirs(outdir, exist_ok=True)

# Import matrices
mat_t7 = cooler.Cooler(f"{mat_t7}::/resolutions/{res}").matrix(
    balance=True, sparse=False
)[:]
mat_t7[np.isnan(mat_t7)] = 0
mat_t7_2p100 = cooler.Cooler(f"{mat_t7_2p100}::/resolutions/{res}").matrix(
    balance=True, sparse=False
)[:]
mat_t7_2p100[np.isnan(mat_t7_2p100)] = 0
mat_t7_2p60 = cooler.Cooler(f"{mat_t7_2p60}::/resolutions/{res}").matrix(
    balance=True, sparse=False
)[:]
mat_t7_2p60[np.isnan(mat_t7_2p60)] = 0
mat_t7_2p100_dv = cooler.Cooler(f"{mat_t7_2p100_dv}::/resolutions/{res}").matrix(
    balance=True, sparse=False
)[:]
mat_t7_2p100_dv[np.isnan(mat_t7_2p100_dv)] = 0
mat_t7_2p60_dv = cooler.Cooler(f"{mat_t7_2p60_dv}::/resolutions/{res}").matrix(
    balance=True, sparse=False
)[:]
mat_t7_2p60_dv[np.isnan(mat_t7_2p60_dv)] = 0
mat_t7_2p100_cv = cooler.Cooler(f"{mat_t7_2p100_cv}::/resolutions/{res}").matrix(
    balance=True, sparse=False
)[:]
mat_t7_2p100_cv[np.isnan(mat_t7_2p100_cv)] = 0

# Import RNA tracks
rna_t7, _ = bcio.extract_big_wig(rna_t7, tracks_res)
rna_t7_2p100, _ = bcio.extract_big_wig(rna_t7_2p100, tracks_res)
rna_t7_2p60, _ = bcio.extract_big_wig(rna_t7_2p60, tracks_res)
rna_t7_2p100_dv, _ = bcio.extract_big_wig(rna_t7_2p100_dv, tracks_res)
rna_t7_2p60_dv, _ = bcio.extract_big_wig(rna_t7_2p60_dv, tracks_res)
rna_t7_2p100_cv, _ = bcio.extract_big_wig(rna_t7_2p100_cv, tracks_res)

# Import chip RNA pol T7
chip_RNA_t7, _ = bcio.extract_big_wig(chip_RNA_t7, res)
chip_RNA_t7_2p100, _ = bcio.extract_big_wig(chip_RNA_t7_2p100, res)
chip_RNA_t7_2p60, _ = bcio.extract_big_wig(chip_RNA_t7_2p60, res)
chip_RNA_t7_2p100_dv, _ = bcio.extract_big_wig(chip_RNA_t7_2p100_dv, res)
chip_RNA_t7_2p60_dv, _ = bcio.extract_big_wig(chip_RNA_t7_2p60_dv, res)
chip_RNA_t7_2p100_cv, _ = bcio.extract_big_wig(chip_RNA_t7_2p100_cv, res)

# Import chip GapR
chip_gapr_t7, _ = bcio.extract_big_wig(chip_gapr_t7, res)
chip_gapr_t7_2p100, _ = bcio.extract_big_wig(chip_gapr_t7_2p100, res)
chip_gapr_t7_2p60, _ = bcio.extract_big_wig(chip_gapr_t7_2p60, res)
chip_gapr_t7_2p100_dv, _ = bcio.extract_big_wig(chip_gapr_t7_2p100_dv, res)
chip_gapr_t7_2p60_dv, _ = bcio.extract_big_wig(chip_gapr_t7_2p60_dv, res)
chip_gapr_t7_2p100_cv, _ = bcio.extract_big_wig(chip_gapr_t7_2p100_cv, res)

# Control GapR
chip_gapr_control, _ = bcio.extract_big_wig(chip_gapr_control, res)

# Compute the log fold change between signal
chip_gapr_t7 = np.log2(chip_gapr_t7) - np.log2(chip_gapr_control) 
chip_gapr_t7_2p100 = np.log2(chip_gapr_t7_2p100) - np.log2(chip_gapr_control)
chip_gapr_t7_2p60 = np.log2(chip_gapr_t7_2p60) - np.log2(chip_gapr_control)
chip_gapr_t7_2p100_dv = np.log2(chip_gapr_t7_2p100_dv) - np.log2(chip_gapr_control)
chip_gapr_t7_2p60_dv = np.log2(chip_gapr_t7_2p60_dv) - np.log2(chip_gapr_control)
chip_gapr_t7_2p100_cv = np.log2(chip_gapr_t7_2p100_cv) - np.log2(chip_gapr_control) 

# Compute HiC signal
hic_t7 = bch.compute_hic_signal(mat_t7, 1000, 0, 5000)
hic_t7_2p100 = bch.compute_hic_signal(mat_t7_2p100, 1000, 0, 5000)
hic_t7_2p60 = bch.compute_hic_signal(mat_t7_2p60, 1000, 0, 5000)
hic_t7_2p100_dv = bch.compute_hic_signal(mat_t7_2p100_dv, 1000, 0, 5000)
hic_t7_2p60_dv = bch.compute_hic_signal(mat_t7_2p60_dv, 1000, 0, 5000)
hic_t7_2p100_cv = bch.compute_hic_signal(mat_t7_2p100_cv, 1000, 0, 5000)

list_rna = [
    rna_t7,
    rna_t7_2p100,
    rna_t7_2p60,
    rna_t7_2p100_dv,
    rna_t7_2p60_dv,
    rna_t7_2p100_cv,
]
list_chip_T7 = [
    chip_RNA_t7,
    chip_RNA_t7_2p100,
    chip_RNA_t7_2p60,
    chip_RNA_t7_2p100_dv,
    chip_RNA_t7_2p60_dv,
    chip_RNA_t7_2p100_cv,
]
list_chip_gapr = [
    chip_gapr_t7,
    chip_gapr_t7_2p100,
    chip_gapr_t7_2p60,
    chip_gapr_t7_2p100_dv,
    chip_gapr_t7_2p60_dv,
    chip_gapr_t7_2p100_cv,
]
list_hic_signal = [
    hic_t7,
    hic_t7_2p100,
    hic_t7_2p60,
    hic_t7_2p100_dv,
    hic_t7_2p60_dv,
    hic_t7_2p100_cv,
]
list_hic = [
    mat_t7,
    mat_t7_2p100,
    mat_t7_2p60,
    mat_t7_2p100_dv,
    mat_t7_2p60_dv,
    mat_t7_2p100_cv,
]

def z_transform(values):
    return (values - np.nanmean(values) / np.nanstd(values))

# for i in range(6):
#     list_rna[i] = z_transform(list_rna[i])
#     list_chip_T7[i] = z_transform(list_chip_T7[i])
#     list_chip_gapr[i] = z_transform(list_chip_gapr[i])
#     list_hic_signal[i] = z_transform(list_hic_signal[i])

# Make the plots 
start, end = 0, 750
binning = 10

fig, ax = plt.subplots(4,6 , figsize=(20,15), sharex=True, gridspec_kw={'height_ratios': [8, 4, 4, 4]})

# Define max values for x_lim
max_rna = 15.5
min_gapr = -4

# Define color
col = ["k", "#fdbf6f", "#1f78b4", '#e31a1c']

for i in range(6):
    # Hicmap
    im = ax[0, i].imshow(
        list_hic[i][start:end, start:end],
        cmap="Reds",
        vmax=np.nanpercentile(list_hic[0], 99.9),
        extent=(start, end, end, start),
    )
    ax[0, i].get_xaxis().set_visible(False)
    ax[0, i].tick_params(axis='both', labelsize=16)
    # ax[0, i].set_ylabel("Genomic coordinates (kb)", size=16)

    # Settings plot 1, 2
    # ax[1, i].set_xlabel("Genomic coordinates (kb)", size=16)
    ax[1, 0].set_ylabel("Transcription (CPM)", size=16)
    ax[1, i].spines['top'].set_visible(False)
    ax[1, i].spines['right'].set_visible(False)
    ax[1, i].set_ylim(-1, 3000)
    ax[1, i].tick_params(axis='both', labelsize=16, color=col[0], labelcolor=col[0])
    # ax[2, i].set_xlabel("Genomic coordinates (kb)", size=16)
    ax[2, 0].set_ylabel("T7 RNA pol signal (CPM)", size=16)
    ax[2, i].spines['top'].set_visible(False)
    ax[2, i].spines['right'].set_visible(False)
    ax[2, i].set_ylim(-0.05, 1)
    ax[2, i].tick_params(axis='both', labelsize=16, color=col[0], labelcolor=col[0])
    ax[3, i].set_xlabel("Genomic coordinates (kb)", size=16)
    ax[3, 0].set_ylabel("GapR signal (log2 fold change)", size=16)
    ax[3, i].spines['top'].set_visible(False)
    ax[3, i].spines['right'].set_visible(False)
    ax[3, i].set_ylim(-0.05, 1)
    ax[3, i].tick_params(axis='both', labelsize=16, color=col[0], labelcolor=col[0])

    # RNAseq
    if list_rna[i].all() != None:
        p1 = ax[1, i].fill_between(
            np.arange(start, end, 1 / binning),
            list_rna[i][int(start * binning):int(end * binning)],
            color=col[0],
            label="RNAseq",
            linewidth=1,
        )

    # HiC signal
    p2 = ax[2, i].plot(
        np.arange(start, end, 1),
        list_hic_signal[i][start:end],
        color=col[3],
        label="HiC signal",
        linewidth=1,
        alpha=.7,
    )
    p2 = ax[3, i].plot(
        np.arange(start, end, 1),
        list_hic_signal[i][start:end],
        color=col[3],
        label="HiC signal",
        linewidth=1,
        alpha=.7,
    )

    # T7 ChIPseq
    ax3 = ax[2, i].twinx()
    if list_chip_T7[i].all() != None:
        p3 = ax3.plot(
            np.arange(start, end, 1),
            list_chip_T7[i][start:end],
            color=col[2],
            label="T7 RNA pol ChIPseq",
            alpha=.7,
            linewidth=1,
        )
        ax3.set_ylim(-100, 6000)

    # GapR ChIPseq
    ax4 = ax[3, i].twinx()
    if list_chip_gapr[i].all() != None:
        p4 = ax4.plot(
            np.arange(start, end, 1),
            list_chip_gapr[i][start:end],
            color=col[1],
            label="GapR ChIPseq",
            alpha=1,
            linewidth=1,
        )
        # ax[3, i].set_ylim(-7.5, 10)
        ax4.set_ylim(-1.5, 1.5)

# Colorbar
cbar = fig.colorbar(im, ax=ax.ravel().tolist(), shrink=.2, anchor=(1.2, .89))
cbar.ax.tick_params(labelsize=16)

# Legend
leg = fig.legend(
    labels=["RNAseq", "GapR ChIPseq", "T7 RNA pol ChIPseq", "HiC signal"],
    loc=(0.9, 0.25),
)
for i in range(4):
    leg.legendHandles[i].set_color(col[i])

# Adjust space between plot
plt.subplots_adjust(wspace=.4, hspace=.2, left=0.05, right=0.88)
plt.savefig(out_plot, dpi=250)


# Correlation between tracks
start=220 #kb
end=580 #kb

def get_binning(values, size=10):
    n = len(values) // size
    new_values = np.zeros(n + 1)
    for i in range(n):
        new_values[i] = np.nanmean(values[i * size:(i + 1) * size])
    new_values[n] = np.nanmean(values[n * size:])
    new_values
    return new_values

for i in range(6):
    list_rna[i] = get_binning(list_rna[i])
    # list_chip_T7[i] = get_binning(list_chip_T7[i])
    # list_chip_gapr[i] = get_binning(list_chip_gapr[i])


with open(out_txt, 'w') as out:
    out.write('Correlation between signal:\n')
    label=["T7", "T7_CL_100", "T7_CL_60", "T7_DV_100", "T7_DV_60", "T7_CV"]
    for i in range(6):
    #     out.write(f"{label[i]};RNA/T7 Pearson correlation: {st.pearsonr(list_rna[i][start:end], list_chip_T7[i][start:end])[0]:.2f}\n")
    #     out.write(f"{label[i]};RNA/GapR Pearson correlation: {st.pearsonr(list_rna[i][start:end], list_chip_gapr[i][start:end])[0]:.2f}\n")
    #     out.write(f"{label[i]};RNA/HiC Pearson correlation: {st.pearsonr(list_rna[i][start:end], list_hic_signal[i][start:end])[0]:.2f}\n")
    #     out.write(f"{label[i]};T7/GapR Pearson correlation: {st.pearsonr(list_chip_T7[i][start:end], list_chip_gapr[i][start:end])[0]:.2f}\n")
    #     out.write(f"{label[i]};T7/HiC Pearson correlation: {st.pearsonr(list_chip_T7[i][start:end], list_hic_signal[i][start:end])[0]:.2f}\n")
    #     out.write(f"{label[i]};GapR/HiC Pearson correlation: {st.pearsonr(list_chip_gapr[i][start:end], list_hic_signal[i][start:end])[0]:.2f}\n")
        out.write(f"{label[i]};RNA/T7 Spearmann correlation: {st.spearmanr(list_rna[i][start:end], list_chip_T7[i][start:end])[0]:.2f}\n")    
        out.write(f"{label[i]};RNA/GapR Spearmann correlation: {st.spearmanr(list_rna[i][start:end], list_chip_gapr[i][start:end])[0]:.2f}\n")
        out.write(f"{label[i]};RNA/HiC Spearmann correlation: {st.spearmanr(list_rna[i][start:end], list_hic_signal[i][start:end])[0]:.2f}\n")
        out.write(f"{label[i]};T7/GapR Spearmann correlation: {st.spearmanr(list_chip_T7[i][start:end], list_chip_gapr[i][start:end])[0]:.2f}\n")
        out.write(f"{label[i]};T7/HiC Spearmann correlation: {st.spearmanr(list_chip_T7[i][start:end], list_hic_signal[i][start:end])[0]:.2f}\n")  
        out.write(f"{label[i]};GapR/HiC Spearmann correlation: {st.spearmanr(list_chip_gapr[i][start:end], list_hic_signal[i][start:end])[0]:.2f}\n")
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
import bacchus.hic as bch
import cooler
import hicstuff.hicstuff as hcs
import matplotlib.pyplot as plt
import numpy as np
import copy 
import os
from scipy import ndimage
import serpentine as srp

# Import snakemake.
mat_t7_ara_file = snakemake.input.mat_ara
mat_t7_novo_file = snakemake.input.mat_novo
mat_t7_ara_rif_file = snakemake.input.mat_ara_rif
mat_t7_novo_rif_file = snakemake.input.mat_novo_rif
cmap = snakemake.params.cmap
res = snakemake.params.res
res_ratio = snakemake.params.res_ratio
width = snakemake.params.width
cpus = snakemake.threads

outfile = str(snakemake.output.matrices)
outfile_ratio = str(snakemake.output.ratio)
outfile_ratio_rif = str(snakemake.output.ratio_rif)
# out_zoom = str(snakemake.output.zoom)
out_zoom_ratio = str(snakemake.output.zoom_ratio)

# Make sure output directory exists.
os.makedirs(str((snakemake.params.outdir)), exist_ok=True)

# Import matrices
subsample = 11675151
files = [mat_t7_ara_file, mat_t7_novo_file, mat_t7_ara_rif_file, mat_t7_novo_rif_file]
hic = [0, 0, 0, 0]
for i, file in enumerate(files):
    clr = cooler.Cooler(f"{file}::/resolutions/{res}")
    mat = clr.matrix(balance=False, sparse=True)[:, :]
    mat = hcs.subsample_contacts(mat, subsample)
    mat = hcs.normalize_sparse(mat, norm='ICE', iterations=100, n_mad=10)
    hic[i] = mat.toarray()

# Ratio
subsample = 10251593
hic_ratio = [0, 0, 0, 0]
for i, file in enumerate(files):
    clr = cooler.Cooler(f"{file}::/resolutions/{res_ratio}")
    mat = clr.matrix(balance=False, sparse=True)[:, :]
    print(mat.sum())
    mat = hcs.subsample_contacts(mat, subsample)
    mat = hcs.normalize_sparse(mat, norm='ICE', iterations=100, n_mad=10)
    hic_ratio[i] = mat.toarray()

ratio = np.log10(hic_ratio[1]) - np.log10(hic_ratio[0])
ratio[np.isnan(ratio)] =  0
ratio[ratio == np.inf] =  0
ratio[ratio == -np.inf] =  0
ratio_rif = np.log10(hic_ratio[3]) - np.log10(hic_ratio[2])
ratio_rif[np.isnan(ratio_rif)] =  0
ratio_rif[ratio_rif == np.inf] =  0
ratio_rif[ratio_rif == -np.inf] =  0

###
# PLOT MATRICES
###

title = ["novo- Rif-", "novo+ Rif-", "novo- Rif+", "novo+ Rif+"]
ratios = [ratio, ratio_rif]
start = 0
end = len(hic[0])
end_ratio = end * res // res_ratio 

fig, ax = plt.subplots(3, 2, figsize=(10, 10))
for i in range(2):
    for j in range(2):
        ax[i, j].imshow(
            hic[i + 2 * j][start:end, start:end],
            vmin=0,
            vmax=0.0012,
            cmap="Reds",
            extent=(start, end, end, start),
        )
    ax[i, j].set_title(title[i + 2 * j])
    ax[2, i].imshow(
        ratios[i][start:end_ratio, start:end_ratio],
        vmin=-2,
        vmax=2,
        cmap="seismic",
        extent=(start, end, end, start),
    )
    ax[2, i].set_title("novo+/novo-")

plt.savefig(outfile)
plt.close()

###
# Plot only ratio.
###

fig, ax = plt.subplots(1, 1, figsize=(10, 10))
ax.imshow(
    ratios[0][start:end_ratio, start:end_ratio],
    vmin=-2,
    vmax=2,
    cmap="seismic",
    extent=(start, end, end, start),
)
ax.set_title("novo+/novo-")

plt.savefig(outfile_ratio, dpi=300)
plt.close()

fig, ax = plt.subplots(1, 1, figsize=(10, 10))
ax.imshow(
    ratios[1][start:end_ratio, start:end_ratio],
    vmin=-2,
    vmax=2,
    cmap="seismic",
    extent=(start, end, end, start),
)
ax.set_title("novo+ Rif+/novo- Rif+")

plt.savefig(outfile_ratio_rif, dpi=300)
plt.close()

###
# Plot zoom without RNAseq.
###

# Rotate matrices.
# hic_rot = copy.copy(hic)
# for i in range(len(hic)):
#     hic_rot[i] = bch.interpolate_white_lines(hic_rot[i])
#     hic_rot[i][np.isnan(hic_rot[i])] = 0
#     hic_rot[i] = ndimage.rotate(hic_rot[i], 45, reshape=True)

# fig, ax = plt.subplots(
#     1, 4, figsize=(20, 10)
# )

# start = 300_000
# end = 480_000

# row1 = len(hic_rot[0]) // 2 - int(np.sqrt(2) * (width / res))
# row2 = len(hic_rot[0]) // 2 + int(np.sqrt(2) * (width / res))
# col1 = int((start // res) * np.sqrt(2))
# col2 = int((end // res) * np.sqrt(2))

# for i in range(len(hic)):
#     # Hicmap
#     im = ax[i].imshow(
#         hic_rot[i][row1:row2, col1:col2],
#         cmap="Reds",
#         vmin=0,
#         vmax=0.002,
#         extent=(col1 / np.sqrt(2), col2 / np.sqrt(2), width // res, -width // res),
#     )
#     ax[i].get_xaxis().set_visible(False)
#     ax[i].set_title(title[i], size=18)
#     if i == 0:
#         ax[i].tick_params(axis="both", labelsize=16)
#         ax[i].set_ylabel("Genomic coordinates (kb)", size=16)
#     else:
#         ax[i].get_yaxis().set_visible(False)

# # Colorbar
# cbar = fig.colorbar(
#     im,
#     ax=ax.ravel().tolist(),
#     shrink=0.5,
#     anchor=(1., 0.5)
# )
# cbar.ax.tick_params(labelsize=16)

# # Adjust space between plot
# plt.subplots_adjust(wspace=0.4, hspace=0.0, left=0.05, right=0.88)
# plt.savefig(out_zoom)
# plt.close()

###
# Plot zoom ratio.
###

start, end = 0, 750
end_ratio = end * res // res_ratio 

fig, ax = plt.subplots(3, 2, figsize=(10, 10))
for i in range(2):
    for j in range(2):
        ax[i, j].imshow(
            hic[i + 2 * j][start:end, start:end],
            vmin=0,
            vmax=0.0012,
            cmap="Reds",
            extent=(start, end, end, start),
        )
    ax[i, j].set_title(title[i + 2 * j])
for i in range(2):
    ax[2, i].imshow(
        ratios[i][start:end_ratio, start:end_ratio],
        vmin=-1,
        vmax=1,
        cmap="seismic",
        extent=(start, end, end, start),
    )
    ax[2, i].set_title("novo+/novo-")

plt.savefig(out_zoom_ratio)
plt.close() 
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
import bacchus.io as bcio
import bacchus.hic as bch
import bacchus.plot as bcp
import bacchus.transcription as bct
import cooler
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import scipy.stats as sst
from os.path import dirname, join

# Make sure that the figures directory exists.
os.makedirs(dirname(snakemake.output.pileup), exist_ok=True)
os.makedirs(dirname(snakemake.output.pileup_zoom), exist_ok=True)

# Import parameters
binning = int(snakemake.params.binning)
label = str(snakemake.params.species)
circular = str(snakemake.params.circular)
window = int(snakemake.params.window)
threshold = int(snakemake.params.threshold)
unit_length = int(snakemake.params.unit_length)

# Import RNA tracks.
rna, chrom_start = bcio.extract_big_wig(
    file=str(snakemake.input.rna),
    binning=binning,
    circular=circular,
    sigma=None,
    ztransform=None,
)
# Ugly loop to have the chrom length...
chrom_start_size = {}
for i, name in enumerate(chrom_start):
    if i != 0:
        chrom_start_size[prev_name] = {
            "start": start,
            "length": chrom_start[name] - start,
        }
    prev_name = name
    start = chrom_start[name]
chrom_start_size[prev_name] = {
    "start": start,
    "length": len(rna) - start,
}

# RNA at 1bp.
rna_1bp, chrom_start_1bp = bcio.extract_big_wig(
    file=str(snakemake.input.rna),
    binning=1,
    circular=circular,
    sigma=None,
    ztransform=None,
)

# Extract annotation from GFF.
annotation = pd.DataFrame(
    columns=["type", "chr", "start", "end", "strand", "name", "tss", "tts", "rpkm"]
)
with open(snakemake.input.annotation, "r") as file:
    for line in file:
        if line.startswith("#"):
            continue
        # Stop if fasta sequence at the end or empty line.
        elif line.startswith(">") or (line.startswith("\n")):
            break
        else:
            line = line.split("\t")
            if line[2] == "CDS":
                name = line[8].split("Name=")[-1].split(";")[0]
                annot = {
                    "type": line[2],
                    "chr": line[0],
                    "start": int(line[3]),
                    "end": int(line[4]),
                    "strand": line[6],
                    "name": name,
                }
                annotation = annotation.append(annot, ignore_index=True)

print(annotation)

# Find TSS/TTS positions.
for i in annotation.index:
    if annotation.loc[i, "strand"] == "+":
        annotation.loc[i, "tss"] = annotation.loc[i, "start"]
        annotation.loc[i, "tts"] = annotation.loc[i, "end"]
    else:
        annotation.loc[i, "tts"] = annotation.loc[i, "start"]
        annotation.loc[i, "tss"] = annotation.loc[i, "end"]

# Compute the RPKM values from the genes.
for i in annotation.index:
    chr_start = chrom_start_1bp[annotation.loc[i, "chr"]]
    annotation.loc[i, "rpkm"] = np.nanmean(
        rna_1bp[
            chr_start
            + annotation.loc[i, "start"] : chr_start
            + annotation.loc[i, "end"]
        ]
    )

# Compute coding density.
n = 0
for i in annotation.index:
    n += annotation.loc[i, "end"] - annotation.loc[i, "start"]
coding_density = n / len(rna_1bp)

# Plot RPKM distribution.
plt.hist(annotation.rpkm, bins=25)
plt.text(
    x=np.nanpercentile(annotation.rpkm, 80),
    y=len(annotation.rpkm) // 25,
    s=f"corr={coding_density:.2f}",
)
plt.savefig(snakemake.output.rpkm)

# Compute pileup
for tu_length in [0, 3000]:
    for threshold2 in np.arange(5, 30, 5):
        rna_pileup_pos, _, pileup_pos, _ = bct.pileup_genes(
            clr=cooler.Cooler(f"{snakemake.input.hic}::/resolutions/{binning}"),
            annotation=annotation,
            rna=rna,
            chrom_start_size=chrom_start_size,
            window_size=25000,
            binning=binning,
            threshold=threshold2,
            neg="detrend",
            tu_length=tu_length,
            operation="mean",
            circular=circular,
        )

        # Plot pileup
        # Parameters
        ax_kb = 1000
        window_plot = 25000 // ax_kb

        # Plot
        fig, ax = plt.subplots(
            2, 1, figsize=(8, 13), gridspec_kw={"height_ratios": [7, 3]}
        )
        # RNA
        ax[1].axvline(
            0, color="black", linestyle="dashed", linewidth=1.5, alpha=0.4
        )
        ax[1].tick_params(axis="both", labelsize=14)
        ax[1].plot(
            np.arange(
                -window_plot,
                window_plot + (1 * binning / ax_kb),
                binning / ax_kb,
            ),
            rna_pileup_pos,
        )
        ax[1].set_ylabel("Transcription (CPM)", fontsize=15)
        ax[1].set_xlabel("Genomic distance (kb)", fontsize=15)
        # HiC
        if tu_length == 0:
            vmax = 0.012
        else:
            vmax = 0.008
        ax[0].get_xaxis().set_visible(False)
        im = ax[0].imshow(
            pileup_pos**0.8,
            cmap="Reds",
            vmin=0,
            vmax=vmax,
            extent=[-window_plot, window_plot, window_plot, -window_plot],
        )
        # ax[0].axvline(0, color="black", linestyle="dashed", linewidth=1.5, alpha=0.4)
        # ax[0].axhline(0, color="black", linestyle="dashed", linewidth=1.5, alpha=0.4)
        ax[0].tick_params(axis="both", labelsize=14)
        ax[0].set_ylabel("Genomic distance (kb)", fontsize=15)

        # Add colorbar, title and savefig
        fig.colorbar(
            im, ax=ax.ravel().tolist(), shrink=0.33, anchor=(1.3, 0.75)
        )
        plt.subplots_adjust(hspace=0.1)
        ax[0].set_title(label, fontsize=20)
        plt.savefig(
            join(
                snakemake.params.out_dir,
                f"pileup_pos_{threshold2}_TU{tu_length}.pdf",
            ),
            dpi=200,
        )

        hic_signal = bch.compute_hic_signal(
            pileup_pos, binning=500, start=0, stop=5000
        )
        print(
            f"{label} (Gene) - Pearson correlation: {sst.pearsonr(hic_signal[10:40], rna_pileup_pos[10:40])[0]:.2f}"
        )
        print(
            f"{label} (Gene) - p-value: {sst.pearsonr(hic_signal[10:40], rna_pileup_pos[10:40])[1]:.5f}"
        )

# Log ratio pileup
rna_pileup_pos, rna_pileup_neg, pileup_pos, pileup_neg = bct.pileup_genes(
    clr=cooler.Cooler(f"{snakemake.input.hic}::/resolutions/{binning}"),
    annotation=annotation,
    rna=rna,
    chrom_start_size=chrom_start_size,
    window_size=window,
    binning=binning,
    threshold=threshold,
    neg="random-neighbor",
    tu_length=unit_length,
    operation="mean",
    circular=circular,
)

# Print correlation:
hic_signal = bch.compute_hic_signal(pileup_pos, binning=500, start=0, stop=5000)
print(
    f"{label} (TU) - Pearson correlation: {sst.pearsonr(hic_signal[10:window//500 - 10], rna_pileup_pos[10:window//500 - 10])[0]:.2f}"
)
print(
    f"{label} (TU) - p-value: {sst.pearsonr(hic_signal[10:window//500 - 10], rna_pileup_pos[10:window//500 - 10])[1]:.5f}"
)

# Plot pileup pos.
# Plot pileup
# Parameters
ax_kb = 1000
window_plot = 25000 // ax_kb

# Plot pos
fig, ax = plt.subplots(
    2, 1, figsize=(8, 13), gridspec_kw={"height_ratios": [7, 3]}
)
# RNA
ax[1].axvline(0, color="black", linestyle="dashed", linewidth=1.5, alpha=0.4)
ax[1].tick_params(axis="both", labelsize=14)
ax[1].plot(
    np.arange(
        -window_plot,
        window_plot + (1 * binning / ax_kb),
        binning / ax_kb,
    ),
    rna_pileup_pos[window // 500 - 50 : window // 500 + 51],
)
ax[1].set_ylabel("Transcription (CPM)", fontsize=15)
ax[1].set_xlabel("Genomic distance (kb)", fontsize=15)
# HiC
ax[0].get_xaxis().set_visible(False)
im = ax[0].imshow(
    pileup_pos[
        window // 500 - 50 : window // 500 + 51,
        window // 500 - 50 : window // 500 + 51,
    ]
    ** 0.8,
    cmap="Reds",
    vmin=0.003,
    vmax=0.007,
    extent=[-window_plot, window_plot, window_plot, -window_plot],
)
# ax[0].axvline(0, color="black", linestyle="dashed", linewidth=1.5, alpha=0.4)
# ax[0].axhline(0, color="black", linestyle="dashed", linewidth=1.5, alpha=0.4)
ax[0].tick_params(axis="both", labelsize=14)
ax[0].set_ylabel("Genomic distance (kb)", fontsize=15)

# Add colorbar, title and savefig
fig.colorbar(im, ax=ax.ravel().tolist(), shrink=0.33, anchor=(1.3, 0.75))
plt.subplots_adjust(hspace=0.1)
ax[0].set_title(label, fontsize=20)
plt.savefig(snakemake.output.pileup_pos_TU, dpi=200)

# Plot neg
fig, ax = plt.subplots(
    2, 1, figsize=(8, 13), gridspec_kw={"height_ratios": [7, 3]}
)
# RNA
ax[1].axvline(0, color="black", linestyle="dashed", linewidth=1.5, alpha=0.4)
ax[1].tick_params(axis="both", labelsize=14)
ax[1].plot(
    np.arange(
        -window_plot,
        window_plot + (1 * binning / ax_kb),
        binning / ax_kb,
    ),
    rna_pileup_neg[window // 500 - 50 : window // 500 + 51],
)
ax[1].set_ylabel("Transcription (CPM)", fontsize=15)
ax[1].set_xlabel("Genomic distance (kb)", fontsize=15)
# HiC
ax[0].get_xaxis().set_visible(False)
im = ax[0].imshow(
    pileup_neg[
        window // 500 - 50 : window // 500 + 51,
        window // 500 - 50 : window // 500 + 51,
    ]
    ** 0.8,
    cmap="Reds",
    vmin=0,
    vmax=np.nanpercentile(
        pileup_pos[
            window // 500 - 50 : window // 500 + 51,
            window // 500 - 50 : window // 500 + 51,
        ]
        ** 0.8,
        99,
    ),
    extent=[-window_plot, window_plot, window_plot, -window_plot],
)
# ax[0].axvline(0, color="black", linestyle="dashed", linewidth=1.5, alpha=0.4)
# ax[0].axhline(0, color="black", linestyle="dashed", linewidth=1.5, alpha=0.4)
ax[0].tick_params(axis="both", labelsize=14)
ax[0].set_ylabel("Genomic distance (kb)", fontsize=15)

# Add colorbar, title and savefig
fig.colorbar(im, ax=ax.ravel().tolist(), shrink=0.33, anchor=(1.3, 0.75))
plt.subplots_adjust(hspace=0.1)
ax[0].set_title(label, fontsize=20)
plt.savefig(snakemake.output.pileup_neg_TU, dpi=200)

# Plot pileup genes

# Plot the pileup ratio.
bcp.pileup_plot(
    pileup=pileup_pos,
    pileup_control=pileup_neg,
    gen_tracks=[rna_pileup_pos],
    gen_tracks_control=[rna_pileup_neg],
    binning=binning,
    window=50000,
    ratio="log",
    out_file=snakemake.output.pileup,
    title=label,
    dpi=200,
    vmax=0.2,
)
bcp.pileup_plot(
    pileup=pileup_pos,
    pileup_control=pileup_neg,
    gen_tracks=[rna_pileup_pos],
    gen_tracks_control=[rna_pileup_neg],
    binning=binning,
    window=25000,
    ratio="log",
    out_file=snakemake.output.pileup_zoom,
    title=label,
    dpi=200,
    vmax=0.2,
)
bcp.pileup_plot(
    pileup=pileup_pos,
    pileup_control=pileup_neg,
    gen_tracks=[rna_pileup_pos],
    gen_tracks_control=[rna_pileup_neg],
    binning=binning,
    window=10000,
    ratio="log",
    out_file=snakemake.output.pileup_zoom2,
    title=label,
    dpi=200,
    vmax=0.2,
)
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
import bacchus.hic as bch
import bacchus.io as bcio
import bacchus.plot as bcp
import cooler
import copy
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np
import os
from os.path import join
import pandas as pd
from scipy import ndimage
import seaborn as sns

# Import snakemake parameters.
annotation_file = snakemake.input.annotation
mat_wt_file = str(snakemake.input.cool_wt)
mat_rf_file = str(snakemake.input.cool_rf)
rna_wt_file = snakemake.input.rna_wt
rna_rf_file = snakemake.input.rna_rf
cov_wt_file = snakemake.input.cov_wt
cov_rf_file = snakemake.input.cov_rf
gc_file = snakemake.input.gc
epod_file = snakemake.input.EPODs
res = int(snakemake.params.res)
cmap = snakemake.params.cmap
outdir = str(snakemake.params.outdir)
width = snakemake.params.width
pos = snakemake.params.positions

# Create outdir if necessary.
os.makedirs(outdir, exist_ok=True)


def import_annotation_gff(annotation_file):
    """Function to create a table of the gene positions from the gff file."""
    annotation = pd.DataFrame(
        columns=["type", "start", "end", "strand", "name"]
    )
    with open(annotation_file, "r") as file:
        for line in file:
            # Header.
            if line.startswith("#"):
                continue
            # Stop at the fasta sequences.
            elif line.startswith(">"):
                break
            else:
                line = line.split("\t")
                if line[2] in ["gene", "tRNA", "rRNA"]:
                    if line[2] == "gene":
                        name = line[8].split("Name=")[-1].split(";")[0]
                        # Extract gene position.
                        annot = {
                            "type": line[2],
                            "start": int(line[3]),
                            "end": int(line[4]),
                            "strand": line[6],
                            "gene_name": name,
                        }
                        annotation = annotation.append(annot, ignore_index=True)
    return annotation


# Import files.
annotation = import_annotation_gff(annotation_file)
mat_wt = cooler.Cooler(f"{mat_wt_file}::/resolutions/{res}").matrix(
    balance=True, sparse=False
)[:]
mat_wt[np.isnan(mat_wt)] = 0
mat_rf = cooler.Cooler(f"{mat_rf_file}::/resolutions/{res}").matrix(
    balance=True, sparse=False
)[:]
mat_rf[np.isnan(mat_rf)] = 0
rna_wt, _ = bcio.extract_big_wig(rna_wt_file, binning=100)
rna_rf, _ = bcio.extract_big_wig(rna_rf_file, binning=100)
cov_wt, _ = bcio.extract_big_wig(cov_wt_file, binning=1000)
cov_rf, _ = bcio.extract_big_wig(cov_wt_file, binning=1000)
gc_content = pd.read_csv(gc_file, sep="\t", header=None).iloc[:, 1]
epod = pd.read_csv(epod_file, sep="\t", header=None)

# Make the rotation of the matrix to plot them
mat_wt_rot = copy.copy(mat_wt)
mat_wt_rot = bch.interpolate_white_lines(mat_wt_rot)
mat_wt_rot[np.isnan(mat_wt_rot)] = 0
mat_wt_rot = ndimage.rotate(mat_wt_rot, 45, reshape=True)
mat_rf_rot = copy.copy(mat_rf)
mat_rf_rot = bch.interpolate_white_lines(mat_rf_rot)
mat_rf_rot[np.isnan(mat_rf_rot)] = 0
mat_rf_rot = ndimage.rotate(mat_rf_rot, 45, reshape=True)


def plot_region(
    M1,
    M1_rot,
    M2,
    M2_rot,
    rna1,
    rna2,
    cov1,
    cov2,
    annotation,
    binning,
    width,
    zoom_ini,
    outfile,
    split=False,
):
    rna_binning = 100
    pal = sns.color_palette("Paired")

    # Defined values to specify the borders of the matrices and tables
    zoom = [zoom // binning for zoom in zoom_ini]
    width = width / 1000
    row1 = len(M1_rot) // 2 - int(np.sqrt(2) * width)
    row2 = len(M1_rot) // 2 + int(np.sqrt(2) * width)
    col1 = int(zoom[0] * np.sqrt(2))
    col2 = int(zoom[1] * np.sqrt(2))
    annotation_zoom = annotation[
        np.logical_and(
            annotation.start > zoom_ini[0], annotation.end < zoom_ini[1]
        )
    ]
    ymax = np.nanmax(
        np.concatenate(
            (
                rna1[zoom_ini[0] // rna_binning : zoom_ini[1] // rna_binning],
                rna2[zoom_ini[0] // rna_binning : zoom_ini[1] // rna_binning],
            )
        )
    )
    max_cov = np.nanmax(cov1) * 1.02
    min_cov = np.nanmin(cov1) * 0.98

    # Define panels depending on split or not.
    if split and ymax > 1500:
        a = 4
        fig, ax = plt.subplots(
            7,
            2,
            figsize=(16, 12),
            gridspec_kw={"height_ratios": [1, 30, 1, 6, 12, 3, 3]},
            sharex=False,
        )

        # Parameters to merge panel 2 and 3.
        d = 0.02
        D = 0.04
        for j in range(2):
            ax[a - 1, j].set_xlim(zoom_ini[0] // 1000, zoom_ini[1] // 1000)
            ax[a - 1, j].tick_params(axis="both", labelsize=15)
            ax[a - 1, j].spines["bottom"].set_visible(False)
            ax[a - 1, j].spines["top"].set_visible(False)
            ax[a - 1, j].spines["right"].set_visible(False)
            ax[a - 1, j].set_ylim(ymax - 525, ymax)
            ax[a - 1, j].get_xaxis().set_visible(False)

            # Add the small cut between the two panels
            kwargs = dict(
                transform=ax[a - 1, j].transAxes, color="k", clip_on=False
            )
            ax[a - 1, j].plot((-d, +d), (-D, +D), **kwargs)  # top-left diagonal
            kwargs.update(
                transform=ax[a, j].transAxes
            )  # switch to the bottom axes
            ax[a, j].plot(
                (-d, +d), (1 - d, 1 + d), **kwargs
            )  # bottom-left diagonal

        # Plot the RNA on the split panel
        ax[a - 1, 0].fill_between(
            np.arange(
                zoom_ini[0] // 1000, zoom_ini[1] // 1000, rna_binning / 1000
            ),
            rna1[zoom_ini[0] // rna_binning : zoom_ini[1] // rna_binning],
            color="black",
        )
        ax[a - 1, 1].fill_between(
            np.arange(
                zoom_ini[0] // 1000, zoom_ini[1] // 1000, rna_binning / 1000
            ),
            rna2[zoom_ini[0] // rna_binning : zoom_ini[1] // rna_binning],
            color="black",
        )

    else:
        a = 3
        fig, ax = plt.subplots(
            6,
            2,
            figsize=(16, 12),
            gridspec_kw={"height_ratios": [1, 30, 1, 15, 3, 3]},
            sharex=False,
        )

    # Plot expressed genes - blue are forward - red are reversed
    for j in range(2):
        ax[0, j].set_xlim(zoom_ini[0], zoom_ini[1])
        ax[0, j].get_xaxis().set_visible(False)
        ax[0, j].get_yaxis().set_visible(False)
        ax[2, j].set_xlim(zoom_ini[0], zoom_ini[1])
        ax[2, j].get_xaxis().set_visible(False)
        ax[2, j].get_yaxis().set_visible(False)
    pos = 1
    for i in range(len(annotation_zoom)):
        # Extract annotation information
        annot = annotation_zoom.iloc[
            i,
        ]
        strand = annot.strand
        start = annot.start // rna_binning
        end = annot.end // rna_binning
        name = annot.gene_name
        # Defined color depending on the strand
        if strand == "+":
            color = pal.as_hex()[1]
        else:
            color = pal.as_hex()[5]
        # Print it only if it transcribed (10% most transcribed genes)
        if np.mean(rna1[start:end]) >= 120.7628998139441:
            pos = pos * -1
            for j in range(2):
                ax[0, j].add_patch(
                    patches.Rectangle(
                        (start * 100, 0),
                        (end - start) * 100,
                        1,
                        edgecolor=color,
                        facecolor=color,
                        fill=True,
                    )
                )
                ax[0, j].text(
                    x=(start + ((end - start) / 2)) * 100,
                    y=pos + 0.5,
                    s=name,
                    rotation=90 * pos,
                    wrap=True,
                )
    color = pal.as_hex()[3]
    for i in epod.index:
        start = epod.loc[i, 3]
        end = epod.loc[i, 4]
        if (
            (start > zoom_ini[0])
            and (start < zoom_ini[1])
            or (end > zoom_ini[0])
            and (end < zoom_ini[1])
        ):
            start = max(start, zoom_ini[0]) // rna_binning
            end = min(end, zoom_ini[1]) // rna_binning
            for j in range(2):
                ax[2, j].add_patch(
                    patches.Rectangle(
                        (start * 100, 0),
                        (end - start) * 100,
                        1,
                        edgecolor=color,
                        facecolor=color,
                        fill=True,
                    )
                )
    #                 ax[2, j].text(
    #                     x=(start + ((end - start) / 2)) * 100,
    #                     y=pos + 0.5,
    #                     s="EPOD",
    #                     rotation=90 * pos,
    #                     wrap=True,
    #                 )

    # Plot the matrices
    ax[1, 0].set_ylabel("Genomic distance (kb)", fontsize=15)
    for j in range(2):
        ax[1, j].get_xaxis().set_visible(False)
        ax[1, j].tick_params(axis="both", labelsize=15)

    # Plot the matrice 1
    im = ax[1, 0].imshow(
        M1_rot[row1:row2, col1:col2],
        cmap=cmap,
        interpolation="none",
        vmin=0,
        vmax=np.nanpercentile(M1[zoom[0] : zoom[1], zoom[0] : zoom[1]], 97),
        extent=(col1 / np.sqrt(2), col2 / np.sqrt(2), width, -width),
    )

    # Plot the matrice 2
    ax[1, 1].imshow(
        M2_rot[row1:row2, col1:col2],
        cmap=cmap,
        interpolation="none",
        vmin=0,
        vmax=np.nanpercentile(M1[zoom[0] : zoom[1], zoom[0] : zoom[1]], 97),
        extent=(col1 / np.sqrt(2), col2 / np.sqrt(2), width, -width),
    )

    # RNAseq legends and plot at the bottom panel.
    ax[a, 0].set_ylabel("RNA count (CPM)", fontsize=15)
    for j in range(2):
        #         ax[a, j].set_xlabel("Genomic coordinates (kb)", size=15)
        ax[a, j].get_xaxis().set_visible(False)
        ax[a, j].tick_params(axis="both", labelsize=15)
        ax[a, j].set_xlim(zoom_ini[0] // 1000, zoom_ini[1] // 1000)
        ax[a, j].spines["top"].set_visible(False)
        ax[a, j].spines["right"].set_visible(False)
        if split:
            if ymax > 1500:
                ax[a, j].set_ylim(0, 1050)
            else:
                ax[a, j].set_ylim(0, 1500)
        else:
            ax[a, j].set_ylim(0, ymax)

    # Plot RNA at the bottom panel
    ax[a, 0].fill_between(
        np.arange(zoom_ini[0] // 1000, zoom_ini[1] // 1000, rna_binning / 1000),
        rna1[zoom_ini[0] // rna_binning : zoom_ini[1] // rna_binning],
        color="black",
    )
    ax[a, 1].fill_between(
        np.arange(zoom_ini[0] // 1000, zoom_ini[1] // 1000, rna_binning / 1000),
        rna2[zoom_ini[0] // rna_binning : zoom_ini[1] // rna_binning],
        color="black",
    )

    # GC content
    gc_binning = 100
    for j in range(2):
        ax[a + 1, j].plot(
            np.arange(
                zoom_ini[0] // 1000, zoom_ini[1] // 1000, gc_binning / 1000
            ),
            gc_content[
                (zoom_ini[0] + 250)
                // gc_binning : (zoom_ini[1] + 250)
                // gc_binning
            ],
            color="black",
        )
        ax[a + 1, j].set_xlim(zoom_ini[0] // 1000, zoom_ini[1] // 1000)
        ax[a + 1, j].tick_params(axis="both", labelsize=15)
        ax[a + 1, j].get_xaxis().set_visible(False)
    ax[a + 1, 0].set_ylabel("GC", fontsize=15)

    # Coverage
    cov_binning = 1000
    ax[a + 2, 0].plot(
        np.arange(zoom_ini[0] // 1000, zoom_ini[1] // 1000, cov_binning / 1000),
        cov1[
            (zoom_ini[0] + 250)
            // cov_binning : (zoom_ini[1] + 250)
            // cov_binning
        ],
        color="black",
    )
    ax[a + 2, 1].plot(
        np.arange(zoom_ini[0] // 1000, zoom_ini[1] // 1000, cov_binning / 1000),
        cov2[
            (zoom_ini[0] + 250)
            // cov_binning : (zoom_ini[1] + 250)
            // cov_binning
        ],
        color="black",
    )
    for j in range(2):
        ax[a + 2, j].set_xlim(zoom_ini[0] // 1000, zoom_ini[1] // 1000)
        ax[a + 2, j].set_ylim(min_cov, max_cov)
        ax[a + 2, j].set_xlabel("Genomic coordinates (kb)", size=15)
        ax[a + 2, j].tick_params(axis="both", labelsize=15)
    ax[a + 2, 0].set_ylabel("HiC\ncoverage\n(CPM)", fontsize=15)

    # Colorbar
    cbar = fig.colorbar(
        im, ax=ax.ravel().tolist(), shrink=0.3, anchor=(1.2, 0.75)
    )
    cbar.ax.tick_params(labelsize=15)

    # Save the fig and adjust margins
    if split and ymax > 1500:
        plt.subplots_adjust(wspace=0.2, hspace=0.1)
    else:
        plt.subplots_adjust(wspace=0.2, hspace=0.1)
    plt.savefig(outfile, bbox_inches="tight", dpi=200)


for split in [True, False]:
    if split:
        outfile = join(
            outdir,
            f"region_{pos[0]}_{pos[1]}_split.pdf",
        )
    else:
        outfile = join(
            outdir,
            f"region_{pos[0]}_{pos[1]}.pdf",
        )
    position = [int(p) * 1000 for p in pos]
    plot_region(
        mat_wt,
        mat_wt_rot,
        mat_rf,
        mat_rf_rot,
        rna_wt,
        rna_rf,
        cov_wt,
        cov_rf,
        annotation,
        res,
        width,
        position,
        outfile,
        split=split,
    )
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import bacchus.hic as bch
import cooler
import hicstuff.hicstuff as hcs
import matplotlib.pyplot as plt
import numpy as np
import copy 
import os
from scipy import ndimage
import bacchus.plot as bcp

# Import snakemake.
wt_file = snakemake.input.mat_wt
pompA_file = snakemake.input.mat_pompA
prpsM_file = snakemake.input.mat_prpsM
cmap = snakemake.params.cmap
res = snakemake.params.res
width = snakemake.params.width

out_wt = str(snakemake.output.wt)
out_pompA = str(snakemake.output.pompA)
out_prpsM = str(snakemake.output.prpsM)

# Make sure output directory exists.
os.makedirs(str((snakemake.params.outdir)), exist_ok=True)

files = [wt_file, pompA_file, prpsM_file]
outs = [out_wt, out_pompA, out_prpsM]
title = ['WT', "pompA", "prpsM"]
hic = [0, 0, 0]
start = 300000 // res
end = 420000 // res
subsample = 24593739
for i, file in enumerate(files):
    clr = cooler.Cooler(f"{file}::/resolutions/{res}")
    mat = clr.matrix(balance=False, sparse=True)[: :]
    print(mat.sum())
    mat = hcs.subsample_contacts(mat, subsample) 
    mat = hcs.normalize_sparse(mat, norm='ICE', iterations=100, n_mad=10)
    hic[i] = mat.toarray()
    bcp.contact_map(
        hic[i],
        binning=res,
        title=title[i],
        zmax=0.002,
        out_file=outs[i],
        start=start,
        end=end,
    )
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
from pickletools import markobject
import bacchus.hic as bch
import bacchus.io as bcio
import cooler
import copy
import hicstuff.hicstuff as hcs
import matplotlib.pyplot as plt
import numpy as np
import os
from scipy import ndimage
import serpentine as srp

# Import snakemake.
mat_t7_glu_file = snakemake.input.mat_t7_glu
mat_t7_ara_file = snakemake.input.mat_t7_ara
mat_t7_glu_rif_file = snakemake.input.mat_t7_glu_rif
mat_t7_ara_rif_file = snakemake.input.mat_t7_ara_rif
rna_t7_glu = snakemake.input.rna_t7_glu
rna_t7_ara = snakemake.input.rna_t7_ara
rna_t7_glu_rif = snakemake.input.rna_t7_glu_rif
rna_t7_ara_rif = snakemake.input.rna_t7_ara_rif

cmap = snakemake.params.cmap
res = snakemake.params.res
res_ratio = snakemake.params.res_ratio
width = snakemake.params.width
cpus = snakemake.threads

outfile = str(snakemake.output.matrices)
out_zoom = str(snakemake.output.zoom)
out_zoom_ratio = str(snakemake.output.zoom_ratio)
out_rna = str(snakemake.output.rna_only)

# Make sure output directory exists.
os.makedirs(str(snakemake.params.outdir), exist_ok=True)

# Import matrices
mat_t7_glu = cooler.Cooler(f"{mat_t7_glu_file}::/resolutions/{res}").matrix(
    balance=True, sparse=False
)[:]
mat_t7_glu[np.isnan(mat_t7_glu)] = 0
mat_t7_ara = cooler.Cooler(f"{mat_t7_ara_file}::/resolutions/{res}").matrix(
    balance=True, sparse=False
)[:]
mat_t7_ara[np.isnan(mat_t7_ara)] = 0
mat_t7_glu_rif = cooler.Cooler(f"{mat_t7_glu_rif_file}::/resolutions/{res}").matrix(
    balance=True, sparse=False
)[:]
mat_t7_glu_rif[np.isnan(mat_t7_glu_rif)] = 0
mat_t7_ara_rif = cooler.Cooler(f"{mat_t7_ara_rif_file}::/resolutions/{res}").matrix(
    balance=True, sparse=False
)[:]
mat_t7_ara_rif[np.isnan(mat_t7_ara_rif)] = 0

# Do the ratio
def contact_map_ratio(
    M1,
    M2,
    iterations=10,
    threshold=10,
    cpus=16,
):
    """Function to compute the log ratio of two matrices and use serpentine to
    return a smoother log ratio.
    """
    M1 = M1.tocoo()
    M2 = M2.tocoo()
    m1 = M1.sum()
    m2 = M2.sum()
    if m1 < m2:
        M2 = hcs.subsample_contacts(M2, int(m1))
    else:
        M1 = hcs.subsample_contacts(M1, int(m2))
    subsample = min(m1, m2)
    print("Matrices have been subsampled to {0} contacts.".format(subsample))
    # M1 = M1.tocsr() - M1.tocsr().multiply(
    #     get_win_density(M1, win_size=3, sym_upper=True) < 2 / 9
    # )
    # M2 = M2.tocsr() - M2.tocsr().multiply(
    #     get_win_density(M2, win_size=3, sym_upper=True) < 2 / 9
    # )
    M1 = bch.get_symmetric(M1).toarray()
    M2 = bch.get_symmetric(M2).toarray()
    if threshold == "auto":
        trend, threshold = srp.MDbefore(M1, M2, show=False)
    else:
        threshold = int(threshold)
    M1_serp, M2_serp, ratio_srp = srp.serpentin_binning(
        M1,
        M2,
        parallel=cpus,
        triangular=False,
        threshold=threshold,
        minthreshold=threshold / 5,
        iterations=iterations,
        verbose=False,
    )
    ratio_log = np.log2(M2) - np.log2(M1)
    return M1_serp, M2_serp, ratio_log, ratio_srp, subsample

mat_t7_glu_sparse = cooler.Cooler(f"{mat_t7_glu_file}::/resolutions/{res_ratio}").matrix(
    balance=False, sparse=True
)[:]
mat_t7_ara_sparse = cooler.Cooler(f"{mat_t7_ara_file}::/resolutions/{res_ratio}").matrix(
    balance=False, sparse=True
)[:]
mat_t7_glu_rif_sparse = cooler.Cooler(f"{mat_t7_glu_rif_file}::/resolutions/{res_ratio}").matrix(
    balance=False, sparse=True
)[:]
mat_t7_ara_rif_sparse = cooler.Cooler(f"{mat_t7_ara_rif_file}::/resolutions/{res_ratio}").matrix(
    balance=False, sparse=True
)[:]

(
    _,
    _,
    ratio_log_glu,
    ratio_srp_glu,
    subsample,
) = contact_map_ratio(mat_t7_glu_sparse, mat_t7_ara_sparse, iterations=25, threshold=100, cpus=cpus)
(
    _,
    _,
    ratio_log_ara,
    ratio_srp_ara,
    subsample,
) = contact_map_ratio(mat_t7_glu_rif_sparse, mat_t7_ara_rif_sparse, iterations=25, threshold=100, cpus=cpus)

###
# PLOT MATRICES
###

title = ["Glucose", "Arabinose", "Glucose + Rif", "Arabinose + Rif"]
hic = [mat_t7_glu, mat_t7_ara, mat_t7_glu_rif, mat_t7_ara_rif]
ratio = [
    ratio_srp_glu - ratio_srp_glu.mean(),
    ratio_srp_ara - ratio_srp_ara.mean(),
]
start = 0
end = len(hic[0])

fig, ax = plt.subplots(2, 3, figsize=(10, 10))
for i in range(2):
    for j in range(2):
        ax[i, j].imshow(
            hic[i + 2 * j][start:end, start:end],
            vmin=0,
            vmax=np.nanpercentile(hic[0], 99),
            cmap="Reds",
            extent=(start, end, end, start),
        )
        ax[i, j].set_title(title[i + 2 * j])
    ax[i, 2].imshow(
        ratio[i][start:end, start:end],
        vmin=-2,
        vmax=2,
        cmap="seismic",
        extent=(start, end, end, start),
    )
    ax[i, 2].set_title("Rif+/Rif-")

plt.savefig(outfile)
plt.close()

###
# Plot zoom ratio.
###

start, end = 260 // 5, 560 // 5
fig, ax = plt.subplots(2, 1, figsize=(5, 10))
for i in range(2):
    ax[i].imshow(
        ratio[i][start:end, start:end],
        vmin=-2,
        vmax=2,
        cmap="seismic",
        extent=(start, end, end, start),
    )
plt.savefig(out_zoom_ratio)
plt.close()

###
# Plot zoom with RNAseq.
###

# Rotate matrices.
hic_rot = copy.copy(hic)
for i in range(len(hic)):
    hic_rot[i] = bch.interpolate_white_lines(hic_rot[i])
    hic_rot[i][np.isnan(hic_rot[i])] = 0
    hic_rot[i] = ndimage.rotate(hic_rot[i], 45, reshape=True)

# Import RNA
rna_binning = 100
rna_t7_glu, _ = bcio.extract_big_wig(rna_t7_glu, rna_binning)
rna_t7_ara, _ = bcio.extract_big_wig(rna_t7_ara, rna_binning)
rna_t7_glu_rif, _ = bcio.extract_big_wig(rna_t7_glu_rif, rna_binning)
rna_t7_ara_rif, _ = bcio.extract_big_wig(rna_t7_ara_rif, rna_binning)
rna = [rna_t7_glu, rna_t7_ara, rna_t7_glu_rif, rna_t7_ara_rif]

fig, ax = plt.subplots(
    2, 4, figsize=(20, 10), gridspec_kw={"height_ratios": [8, 4]}, sharex=False
)

start = 300_000
end = 480_000

row1 = len(hic_rot[0]) // 2 - int(np.sqrt(2) * (width / res))
row2 = len(hic_rot[0]) // 2 + int(np.sqrt(2) * (width / res))
col1 = int((start // res) * np.sqrt(2))
col2 = int((end // res) * np.sqrt(2))

for i in range(4):
    # Hicmap
    im = ax[0, i].imshow(
        hic_rot[i][row1:row2, col1:col2],
        cmap="Reds",
        vmin=0,
        vmax=np.nanpercentile(hic[0], 99),
        extent=(col1 / np.sqrt(2), col2 / np.sqrt(2), width // res, -width // res),
    )
    ax[0, i].get_xaxis().set_visible(False)
    ax[0, i].set_title(title[i], size=18)
    if i == 0:
        ax[0, i].tick_params(axis="both", labelsize=16)
        ax[0, i].set_ylabel("Genomic coordinates (kb)", size=16)
    else:
        ax[0, i].get_yaxis().set_visible(False)

    # RNAseq
    ax[1, i].set_xlabel("Genomic coordinates (kb)", size=16)
    if i == 0:
        ax[0, i].tick_params(axis="both", labelsize=16)
        ax[1, i].set_ylabel("Transcription (CPM)", size=16)
    else:
        ax[0, i].get_yaxis().set_visible(False)

    ax_rna = ax[1, i]
    p1 = ax_rna.plot(
        np.arange(start / 1000, end / 1000, rna_binning / 1000),
        rna[i][int(start / rna_binning) : int(end / rna_binning)],
        color="k",
        label="RNAseq",
        alpha=0.7,
        linewidth=1,
    )
    ax_rna.set_ylim(0, 8000)
    ax_rna.tick_params(axis="both", labelsize=16, color="k", labelcolor="k")
    ax_rna.spines["top"].set_visible(False)
    ax_rna.spines["right"].set_visible(False)

# Colorbar
cbar = fig.colorbar(im, ax=ax.ravel().tolist(), shrink=0.5, anchor=(1.2, 0.8))
cbar.ax.tick_params(labelsize=16)

# Adjust space between plot
plt.subplots_adjust(wspace=0.4, hspace=0.0, left=0.05, right=0.88)
plt.savefig(out_zoom)
plt.close()

# Plot RNAseq only
start = 300_000
end = 560_000

fig, ax = plt.subplots(1, 1, figsize=(10, 5))
ax.set_xlabel("Genomic coordinates (kb)", size=16)
ax.set_ylabel("Transcription (CPM)", size=16)
for i in [0, 1, 3]:
    ax.plot(
        np.arange(start / 1000, end / 1000, rna_binning / 1000),
        rna[i][int(start / rna_binning) : int(end / rna_binning)],
        label=title[i],
        alpha=0.7,
        linewidth=1,
    )
ax.set_ylim(0, 1500)
ax.tick_params(axis="both", labelsize=16, color="k", labelcolor="k")
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
plt.legend()
plt.savefig(out_rna)
plt.close()
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
from pickletools import markobject
import bacchus.hic as bch
import bacchus.io as bcio
import cooler
import copy
import hicstuff.hicstuff as hcs
import matplotlib.pyplot as plt
import numpy as np
import os
from scipy import ndimage
import serpentine as srp

# Import snakemake.
mat_t7_glu_file = snakemake.input.mat_t7_glu
mat_t7_ara_file = snakemake.input.mat_t7_ara
mat_t7_glu_rif_file = snakemake.input.mat_t7_glu_rif
mat_t7_ara_rif_file = snakemake.input.mat_t7_ara_rif
rna_t7_glu = snakemake.input.rna_t7_glu
rna_t7_ara = snakemake.input.rna_t7_ara
rna_t7_glu_rif = snakemake.input.rna_t7_glu_rif
rna_t7_ara_rif = snakemake.input.rna_t7_ara_rif
chip_t7_glu = snakemake.input.chip_t7_glu
chip_t7_ara = snakemake.input.chip_t7_ara
chip_t7_glu_rif = snakemake.input.chip_t7_glu_rif
chip_t7_ara_rif_R1 = snakemake.input.chip_t7_ara_rif_R1
chip_t7_ara_rif_R2 = snakemake.input.chip_t7_ara_rif_R2

cmap = snakemake.params.cmap
res = snakemake.params.res
res_ratio = snakemake.params.res_ratio
width = snakemake.params.width
cpus = snakemake.threads

outdir = str(snakemake.params.outdir)
outfile = str(snakemake.output.matrices)
out_zoom = str(snakemake.output.zoom)
out_zoom_ratio = str(snakemake.output.zoom_ratio)

# Make sure output directory exists.
os.makedirs(outdir, exist_ok=True)

# Import matrices
mat_t7_glu = cooler.Cooler(f"{mat_t7_glu_file}::/resolutions/{res}").matrix(
    balance=True, sparse=False
)[:]
mat_t7_glu[np.isnan(mat_t7_glu)] = 0
mat_t7_ara = cooler.Cooler(f"{mat_t7_ara_file}::/resolutions/{res}").matrix(
    balance=True, sparse=False
)[:]
mat_t7_ara[np.isnan(mat_t7_ara)] = 0
mat_t7_glu_rif = cooler.Cooler(f"{mat_t7_glu_rif_file}::/resolutions/{res}").matrix(
    balance=True, sparse=False
)[:]
mat_t7_glu_rif[np.isnan(mat_t7_glu_rif)] = 0
mat_t7_ara_rif = cooler.Cooler(f"{mat_t7_ara_rif_file}::/resolutions/{res}").matrix(
    balance=True, sparse=False
)[:]
mat_t7_ara_rif[np.isnan(mat_t7_ara_rif)] = 0

# Do the ratio
def contact_map_ratio(
    M1,
    M2,
    iterations=10,
    threshold=10,
    cpus=16,
):
    """Function to compute the log ratio of two matrices and use serpentine to
    return a smoother log ratio.
    """
    M1 = M1.tocoo()
    M2 = M2.tocoo()
    m1 = M1.sum()
    m2 = M2.sum()
    if m1 < m2:
        M2 = hcs.subsample_contacts(M2, int(m1))
    else:
        M1 = hcs.subsample_contacts(M1, int(m2))
    subsample = min(m1, m2)
    print("Matrices have been subsampled to {0} contacts.".format(subsample))
    # M1 = M1.tocsr() - M1.tocsr().multiply(
    #     get_win_density(M1, win_size=3, sym_upper=True) < 2 / 9
    # )
    # M2 = M2.tocsr() - M2.tocsr().multiply(
    #     get_win_density(M2, win_size=3, sym_upper=True) < 2 / 9
    # )
    M1 = bch.get_symmetric(M1).toarray()
    M2 = bch.get_symmetric(M2).toarray()
    if threshold == "auto":
        trend, threshold = srp.MDbefore(M1, M2, show=False)
    else:
        threshold = int(threshold)
    M1_serp, M2_serp, ratio_srp = srp.serpentin_binning(
        M1,
        M2,
        parallel=cpus,
        triangular=False,
        threshold=threshold,
        minthreshold=threshold / 5,
        iterations=iterations,
        verbose=False,
    )
    ratio_log = np.log2(M2) - np.log2(M1)
    return M1_serp, M2_serp, ratio_log, ratio_srp, subsample

mat_t7_glu_sparse = cooler.Cooler(f"{mat_t7_glu_file}::/resolutions/{res_ratio}").matrix(
    balance=False, sparse=True
)[:]
mat_t7_ara_sparse = cooler.Cooler(f"{mat_t7_ara_file}::/resolutions/{res_ratio}").matrix(
    balance=False, sparse=True
)[:]
mat_t7_glu_rif_sparse = cooler.Cooler(f"{mat_t7_glu_rif_file}::/resolutions/{res_ratio}").matrix(
    balance=False, sparse=True
)[:]
mat_t7_ara_rif_sparse = cooler.Cooler(f"{mat_t7_ara_rif_file}::/resolutions/{res_ratio}").matrix(
    balance=False, sparse=True
)[:]

(
    _,
    _,
    ratio_log_glu,
    ratio_srp_glu,
    subsample,
) = contact_map_ratio(mat_t7_glu_sparse, mat_t7_ara_sparse, iterations=25, threshold=100, cpus=cpus)
(
    _,
    _,
    ratio_log_ara,
    ratio_srp_ara,
    subsample,
) = contact_map_ratio(mat_t7_glu_rif_sparse, mat_t7_ara_rif_sparse, iterations=25, threshold=100, cpus=cpus)


###
# PLOT MATRICES
###

title = ["Glucose", "Arabinose", "Glucose + Rif", "Arabinose + Rif"]
hic = [mat_t7_glu, mat_t7_ara, mat_t7_glu_rif, mat_t7_ara_rif]
ratio = [
    ratio_srp_glu - ratio_srp_glu.mean(),
    ratio_srp_ara - ratio_srp_ara.mean(),
]
start = 0
end = len(hic[0])

fig, ax = plt.subplots(2, 3, figsize=(10, 10))
for i in range(2):
    for j in range(2):
        ax[i, j].imshow(
            hic[i + 2 * j][start:end, start:end],
            vmin=0,
            vmax=np.nanpercentile(hic[0], 99),
            cmap="Reds",
            extent=(start, end, end, start),
        )
        ax[i, j].set_title(title[i + 2 * j])
    ax[i, 2].imshow(
        ratio[i][start:end, start:end],
        vmin=-2,
        vmax=2,
        cmap="seismic",
        extent=(start, end, end, start),
    )
    ax[i, 2].set_title("Rif+/Rif-")

plt.savefig(outfile, dpi=300)
plt.close()

###
# Plot zoom ratio.
###

start, end = 260 // 5, 560 // 5
fig, ax = plt.subplots(2, 1, figsize=(5, 10))
for i in range(2):
    ax[i].imshow(
        ratio[i][start:end, start:end],
        vmin=-2,
        vmax=2,
        cmap="seismic",
        extent=(start, end, end, start),
    )
plt.savefig(out_zoom_ratio)
plt.close()

###
# Plot zoom.
###

# Rotate matrices.
hic_rot = copy.copy(hic)
for i in range(len(hic)):
    hic_rot[i] = bch.interpolate_white_lines(hic_rot[i])
    hic_rot[i][np.isnan(hic_rot[i])] = 0
    hic_rot[i] = ndimage.rotate(hic_rot[i], 45, reshape=True)

start = 240_000
end = 420_000

row1 = len(hic_rot[0]) // 2 - int(np.sqrt(2) * (width / res))
row2 = len(hic_rot[0]) // 2 + int(np.sqrt(2) * (width / res))
col1 = int((start // res) * np.sqrt(2))
col2 = int((end // res) * np.sqrt(2))

# Import RNA
rna_binning = 100
rna_t7_glu, _ = bcio.extract_big_wig(rna_t7_glu, rna_binning)
rna_t7_ara, _ = bcio.extract_big_wig(rna_t7_ara, rna_binning)
rna_t7_glu_rif, _ = bcio.extract_big_wig(rna_t7_glu_rif, rna_binning)
rna_t7_ara_rif, _ = bcio.extract_big_wig(rna_t7_ara_rif, rna_binning)
rna = [rna_t7_glu, rna_t7_glu_rif, rna_t7_ara, rna_t7_ara_rif]

# Import ChIP
chip_binning = 100
chip_t7_glu, _ = bcio.extract_big_wig(chip_t7_glu, chip_binning)
chip_t7_ara, _ = bcio.extract_big_wig(chip_t7_ara, chip_binning)
chip_t7_glu_rif, _ = bcio.extract_big_wig(chip_t7_glu_rif, chip_binning)
chip_t7_ara_rif_R1, _ = bcio.extract_big_wig(chip_t7_ara_rif_R1, chip_binning)
chip_t7_ara_rif_R2, _ = bcio.extract_big_wig(chip_t7_ara_rif_R2, chip_binning)
T7_pol = [chip_t7_glu, chip_t7_glu_rif, chip_t7_ara, chip_t7_ara_rif_R1]

fig, ax = plt.subplots(
    3,
    4,
    figsize=(20, 6),
    gridspec_kw={"height_ratios": [15, 7, 7]},
    sharex=True,
)
for j in range(4):

    # HiC
    im = ax[0, j].imshow(
        hic_rot[j][row1:row2, col1:col2],
        cmap="Reds",
        vmax=0.002,
        vmin=0,
        extent=(col1 / np.sqrt(2), col2 / np.sqrt(2), width // res, -width // res),
    )

    # RNAseq
    ax[1, j].fill_between(
        np.arange(start / res, end / res, 0.1), rna[j][start // 100 : end // 100], color="k"
    )
    ax[1, j].set_ylim(0, 5000)

    # T7 pol Chip_seq
    ax[2, j].plot(
        np.arange(start / res, end / res, 0.1),
        T7_pol[j][start  // 100:end // 100],
    )
    ax[2, j].set_ylim(0, 5000)

    # Legend
    ax[0, j].set_title(title[j], size=16)
    ax[0, j].tick_params(axis="both", labelsize=12)
    ax[2, j].tick_params(axis="both", labelsize=12)
    if j != 0:
        ax[0, j].get_yaxis().set_visible(False)

    ax[1, j].set_xlabel("Genomic coordinates (kb)", fontsize=14)
ax[0, 0].set_ylabel("Genomic\ndistance (kb)", fontsize=14)
ax[1, 0].set_ylabel("Transcription\n(CPM)", fontsize=14)


# Colorbar
cbar = fig.colorbar(im, ax=ax.ravel().tolist(), shrink=0.3, anchor=(-0.2, 0.4))
cbar.ax.tick_params(labelsize=15)
plt.savefig(out_zoom)
tool / pypi

scipy

.. image:: https://raw.githubusercontent.com/scipy/scipy/main/doc/source/_static/logo.svg :target: https://scipy.org :width: 110 :height: 110 :align: left .. image:: https://img.shields.io/badge/powered%20by-NumFOCUS-orange.svg?style=flat&colorA=E1523D&colorB=007D8A :target: https://numfocus.org .. image:: https://img.shields.io/pypi/dm/scipy.svg?label=Pypi%20downloads :target: https://pypi.org/project/scipy/ .. image:: https://img.shields.io/conda/dn/conda-forge/scipy.svg?label=Conda%20downloads :target: https://anaconda.org/conda-forge/scipy .. image:: https://img.shields.io/badge/stackoverflow-Ask%20questions-blue.svg :target: https://stackoverflow.com/questions/tagged/scipy .. image:: https://img.shields.io/badge/DOI-10.1038%2Fs41592--019--0686--2-blue :target: https://www.nature.com/articles/s41592-019-0686-2 SciPy (pronounced "Sigh Pie") is an open-source software for mathematics, science, and engineering. It includes modules for statistics, optimization, integration, li