TYTUS - UBCS statistics calculation tool


Software used for calculation of the UBCS statistics. The results were presented in the talk "Revised time estimation of the ancestral human chromosome 2 fusion" at the International Symposium on Bioinformatics Research and Applications (ISBRA), December 1-4, 2020.

Usage

All computations presented in the article can be reproduced using two Snakemake workflows (an example invocation is shown below):

  • "Snakefile_downloads.smk" to download all the needed data

  • "Snakefile" to reproduce all subsequent computations

License

The software is released under the MIT License.

Code Snippets

scripts/python/getSNDsFromAXT.py:

from utils import get_chrom_sizes
from consts import QUALITY_WINDOW_LENGTH, MAX_NUMBER_OF_MISMACHES

from Bio.Seq import Seq

# Files and parameters provided by Snakemake.
alignment_file = snakemake.input['alignment_file']
snd_file = snakemake.output['snd_file']
chrom_sizes_file = snakemake.input['query_chrom_sizes_file']
snp_file = snakemake.input['snp_file']

chrom_sizes = get_chrom_sizes(chrom_sizes_file)

query_name = snakemake.wildcards['query']
target_name = snakemake.wildcards['target']
ffrom = snakemake.params['ffrom']

def get_snps(snp_file):
    """Read a BED-like SNP file and map every covered 1-based position
    to its base and the full source line."""

    snps = {}

    with open(snp_file) as f:

        for line in f:

            line = line.split()

            # Convert 0-based, half-open BED coordinates to 1-based positions.
            start_coord = int(line[1]) + 1
            end_coord = int(line[2]) + 1

            base = line[4]

            for i in range(start_coord, end_coord):
                snps[i] = (base, line)

    return snps


def get_axt_entry(f, ffrom):
    """Read one entry from an AXT alignment file: a summary line, the two
    aligned sequences and a trailing blank line. Returns None at EOF.
    Depending on 'ffrom', the file's primary species is treated as the
    target or as the query."""

    header = f.readline()

    if not header: return None

    header = header.split()

    strand = header[7]

    if ffrom == 'target':

        t_chrom = header[1]
        t_start = int(header[2])

        q_chrom = header[4]
        q_start = int(header[5])

        target_al_seq = f.readline().rstrip().upper()
        query_al_seq = f.readline().rstrip().upper()

    else:

        t_chrom = header[4]
        t_start = int(header[5])

        q_chrom = header[1]
        q_start = int(header[2])

        query_al_seq = f.readline().rstrip().upper()
        target_al_seq = f.readline().rstrip().upper()

    # Skip the blank line separating AXT entries.
    f.readline()

    return t_chrom, t_start, q_chrom, q_start, strand, target_al_seq, query_al_seq

def add_snds_from_axt_entry(t_chrom, t_start, q_chrom, q_start, strand, target_al_seq, query_al_seq, snd_bed_entries):
    """Scan one aligned sequence pair for single-nucleotide differences
    (SNDs) and record them as BED entries keyed by target start coordinate."""

    t_count = 0
    q_count = 0

    for i, (t_chr, q_chr) in enumerate(zip(target_al_seq, query_al_seq)):

        if t_chr != '-':
            t_count += 1

        if q_chr != '-':
            q_count += 1

        if t_chr == q_chr:
            continue

        # Quality window of QUALITY_WINDOW_LENGTH alignment columns on each
        # side of the mismatch.
        t_window = target_al_seq[i - QUALITY_WINDOW_LENGTH: i + QUALITY_WINDOW_LENGTH + 1]
        q_window = query_al_seq[i - QUALITY_WINDOW_LENGTH: i + QUALITY_WINDOW_LENGTH + 1]

        # Skip mismatches too close to the alignment boundaries.
        if len(t_window) != 2 * QUALITY_WINDOW_LENGTH + 1: continue

        # Skip windows containing insertions or deletions.
        if '-' in t_window + q_window: continue

        # Skip windows with more than MAX_NUMBER_OF_MISMACHES mismatches.
        if len([1 for t_win_chr, q_win_chr in zip(t_window, q_window) if t_win_chr != q_win_chr]) > MAX_NUMBER_OF_MISMACHES: continue

        t_subst_start = t_start + t_count - 2

        # For '-' strand alignments, convert coordinates back to the forward
        # strand and reverse-complement the windows.
        if strand == '-' and ffrom == 'query':
            t_subst_start = chrom_sizes[t_chrom] - (t_start + t_count - 1)
            t_window = str(Seq(t_window).reverse_complement())
            q_window = str(Seq(q_window).reverse_complement())

        t_coord_start = str(t_subst_start - QUALITY_WINDOW_LENGTH)
        t_coord_end = str(t_subst_start + QUALITY_WINDOW_LENGTH + 1)

        t_coords = t_chrom + ':' + t_coord_start + '-' + t_coord_end

        q_subst_start = q_start + q_count - 2

        if strand == '-' and ffrom == 'target':
            q_subst_start = chrom_sizes[q_chrom] - (q_start + q_count - 1)

        q_coord_start = str(q_subst_start - QUALITY_WINDOW_LENGTH)
        q_coord_end = str(q_subst_start + QUALITY_WINDOW_LENGTH + 1)

        q_coords = q_chrom + ':' + q_coord_start + '-' + q_coord_end

        # Flag the SND if the substituted position overlaps a known SNP.
        t_snp = (int(t_coord_start) + QUALITY_WINDOW_LENGTH + 1) in snps

        name = ('SND_ID:' + str(len(snd_bed_entries)), 'QUERY:' + query_name, 'TARGET:' + target_name,
                'TARGET_COORDS:' + t_coords, 'QUERY_COORDS:' + q_coords,
                'CHANGE:' + t_window + '>' + q_window, 'FROM:' + ffrom, 'SNP:' + str(t_snp), 'STRAND:' + strand)

        name = ';'.join(name)

        snd_bed_entry = '\t'.join([t_chrom, t_coord_start, t_coord_end, name, '0', strand])

        snd_bed_entries[int(t_coord_start)] = snd_bed_entry

def get_snd_bed_entries(alignment_file, snps):

    with open(alignment_file) as f:

        snd_bed_entries = {}

        while True:

            axt_entry = get_axt_entry(f, ffrom)

            if axt_entry is None:
                break

            t_chrom, t_start, q_chrom, q_start, strand, target_al_seq, query_al_seq = axt_entry

            add_snds_from_axt_entry(t_chrom, t_start, q_chrom, q_start, strand, target_al_seq, query_al_seq, snd_bed_entries)

        return snd_bed_entries

def save_snd_bed_entries(snd_bed_entries_file, snd_bed_entries):

    with open(snd_bed_entries_file, 'w') as f_out:

        # Write the entries sorted by target start coordinate.
        for t_coord_start in sorted(snd_bed_entries):
            snd_bed_entry = snd_bed_entries[t_coord_start]
            f_out.write(snd_bed_entry)
            f_out.write('\n')

snps = get_snps(snp_file)
snd_bed_entries = get_snd_bed_entries(alignment_file, snps)
save_snd_bed_entries(snd_file, snd_bed_entries)
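
For reference, get_axt_entry consumes the UCSC AXT format: each entry is a summary line (alignment number, primary chromosome, start, end, aligning chromosome, start, end, strand, score) followed by the two aligned sequences and a blank line. A minimal, hypothetical entry (all coordinates and bases invented for illustration; the mismatch in the eighth column would be reported as an SND):

    0 chr2 1000 1012 chr12 5000 5012 + 3500
    ACGTAACGTACGT
    ACGTAACTTACGT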

scripts/python/getUBCSFastStats.py:

from collections import defaultdict
from SND import get_SNDs_derived_in_target
from SND import get_SNDs_derived_in_query
from SNDWindow import SNDWindow

# Files provided by Snakemake.
liftover_fasta_file = snakemake.input['liftover_fasta_file']
ubcs_stats_file = snakemake.output['ubcs_stats_file']
ubcs_stats_details_file = snakemake.output['ubcs_stats_details_file']
ubcs_stats_log_file = snakemake.output['ubcs_stats_log_file']

# Wildcards selecting the analysed dataset.
derived_in_type = snakemake.wildcards['derived']
query = snakemake.wildcards['query']
target = snakemake.wildcards['target']
outgroup = snakemake.wildcards['outgroup']
ffrom = snakemake.wildcards['ffrom']
with_snps = snakemake.wildcards['with_snps'] == 'True'

# Analysis parameters.
REGION_LEN = int(snakemake.wildcards['region'])
chrom = snakemake.wildcards['chromosome']
WINDOW_SIZE = int(snakemake.wildcards['window_size'])
NUMBER_OF_BINS = int(snakemake.wildcards['number_of_bins'])

def get_UBCS_stats():

    with open(ubcs_stats_file, 'w') as f_out, open(ubcs_stats_details_file, 'w') as f_details_out, open(ubcs_stats_log_file, 'w') as f_log_out:

        # Per-region counters: number of SNDs, number of biased SNDs,
        # fraction of biased SNDs (p), and expected vs. actual numbers of
        # biased clustered substitutions (BCS).
        region_snds_number = defaultdict(int)
        region_biased_snds_number = defaultdict(int)
        region_p = defaultdict(int)
        region_expected_BCS = defaultdict(float)
        region_actual_BCS = defaultdict(int)

        if derived_in_type == 'target':
            SNDs = get_SNDs_derived_in_target(liftover_fasta_file)
        else:
            SNDs = get_SNDs_derived_in_query(liftover_fasta_file)

        # Optionally exclude SNDs overlapping known SNPs.
        if not with_snps:
            SNDs = [SND for SND in SNDs if not SND.is_snp()]

        # First pass: per-region fraction of biased SNDs.
        for SND in SNDs:

            region = SND.target_coord() // REGION_LEN
            region_snds_number[region] += 1

            if SND.biased():
                region_biased_snds_number[region] += 1

        for region in region_snds_number:
            region_p[region] = region_biased_snds_number[region] / region_snds_number[region]

        # Second pass: per-SND window statistics.
        for index, SND in enumerate(SNDs):
            coord = SND.target_coord()

            region = coord // REGION_LEN

            window = SNDWindow(index, snds = SNDs, window_size = WINDOW_SIZE, number_of_bins = NUMBER_OF_BINS)

            # Fall back to 20 bins when a single cluster is too frequent;
            # log the affected coordinate.
            if WINDOW_SIZE == NUMBER_OF_BINS and window.get_max_frequency_of_cluster() >= 23:
                f_log_out.write(str(coord) + ' ' + str(window.get_compressed_window_counts()) + '\n')
                window = SNDWindow(index, snds = SNDs, window_size = WINDOW_SIZE, number_of_bins = 20)

            snd_is_biased = SND.biased()
            is_clustered = window.is_clustered()
            is_biased_clustered = window.is_biased_clustered()

            p = region_p[region]
            prob_of_bcs = window.get_prob_of_bcs(p)
            region_actual_BCS[region] += int(is_biased_clustered)
            region_expected_BCS[region] += prob_of_bcs
            o = chrom, REGION_LEN, WINDOW_SIZE, NUMBER_OF_BINS, derived_in_type, target, query, outgroup, ffrom, with_snps, coord, p, snd_is_biased, is_clustered, is_biased_clustered, prob_of_bcs
            f_details_out.write('\t'.join(map(str, o)))
            f_details_out.write('\n')

        # Per-region summary: expected vs. actual BCS counts.
        for region in region_p:
            p = region_p[region]
            expected_BCS = region_expected_BCS[region]
            actual_BCS = region_actual_BCS[region]
            o = chrom, region * REGION_LEN, (region + 1) * REGION_LEN - 1, WINDOW_SIZE, NUMBER_OF_BINS, derived_in_type, target, query, outgroup, ffrom, with_snps, p, expected_BCS, actual_BCS, actual_BCS - expected_BCS
            f_out.write('\t'.join(map(str, o)))
            f_out.write('\n')

get_UBCS_stats()
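
In outline, the script makes two passes over the SNDs of a chromosome: first, for every region of REGION_LEN bases it estimates p, the fraction of SNDs that are biased; second, for every SND it builds an SNDWindow over the surrounding SNDs, adds the window's probability of being a biased clustered substitution (given p) to the region's expected_BCS, and counts actually observed ones in actual_BCS. The statistic reported per region is the excess actual_BCS - expected_BCS.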

scripts/R/liftover.R:

suppressMessages(library('liftOver'))
suppressMessages(library('rtracklayer'))

chain_file <- snakemake@input[['chain_file']]
SNDs_file <- snakemake@input[['SNDs_file']]
liftover_file <- snakemake@output[['liftover_file']]

# Lift SND coordinates to the other assembly using a UCSC chain file.
chain <- import.chain(chain_file)
SNDs <- import(SNDs_file, format = 'BED')

# Force the '+' strand so lifted intervals keep a uniform orientation.
strand(SNDs) <- '+'

# liftOver returns a GRangesList (one element per input range); flatten it.
after_liftover <- liftOver(SNDs, chain)
after_liftover <- unlist(after_liftover)

export(after_liftover, liftover_file, format = 'BED')

Additional excerpts from the Snakefile rules (directives only; two rules invoke getSNDsFromAXT.py):

script:
    'scripts/python/getUBCSFastStats.py'

script:
    'scripts/R/liftover.R'

shell:
    'bedtools getfasta -s -name -fi {input.ref_file} -bed {input.liftover_file} > {output}'

script:
    'scripts/python/getSNDsFromAXT.py'

script:
    'scripts/python/getSNDsFromAXT.py'

shell:
    'samtools faidx {input}'


Name: tytus
URL: https://github.com/bposzewiecka/tytus
Version: 1
Maintainers: public
License: MIT License
