Process aligned Iso-Seq libraries to: (1) identify consensus gene models; (2) compare gene model maintenance between samples.


This is a Snakemake project template. The Snakefile is under workflow/.

Slides describing and justifying the use of this template.

Code Snippets

run:
    #check input format: bam, plain fastq, or gzipped fastq
    if input["read"].endswith(".bam"):
        shell("bedtools bamtofastq -i {input.read} -fq /dev/stdout | gzip > {output.fastq}")
    elif input["read"].endswith((".fastq", ".fq")):
        #compress straight into the output path; `cp` then `gzip {output.fastq}` would leave the file at {output.fastq}.gz
        shell("gzip -c {input.read} > {output.fastq}")
    elif input["read"].endswith((".fastq.gz", ".fq.gz")):
        shell("ln -s {input.read} {output.fastq}")
    else:
        raise Exception(f"Input Error: iso_seq supported formats are fastqs and bams. File passed: {input.read}")
    shell:'''
        seqkit seq -j {threads} --min-len {params.min_len} --min-qual {params.min_qual} {input.fastq} -o - | gzip -c > {output.fastq}
'''
run:
    import gzip
    from Bio import SeqIO
    outs = output["fastq"] #list of output fastq paths to split across
    with gzip.open(input.fastq, "rt") as handle:
        recs = list(SeqIO.parse(handle, "fastq"))
        nrecs = len(recs)
        nouts = len(outs)
        #write contiguous, near-equal chunks: chunk i covers [i*nrecs/nouts, (i+1)*nrecs/nouts)
        for idx, out in enumerate(outs):
            start = int(idx * nrecs / nouts)
            end = int((idx + 1) * nrecs / nouts)
            print(start, end, nrecs) #log chunk boundaries
            SeqIO.write(recs[start:end], out, "fastq")
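The chunk boundaries `int(idx*nrecs/nouts)` produce contiguous, near-equal slices that always tile the full record list. A quick check of the arithmetic for 10 records split 3 ways:

#boundary arithmetic used above: chunk i covers [i*nrecs/nouts, (i+1)*nrecs/nouts)
nrecs, nouts = 10, 3
bounds = [(int(i * nrecs / nouts), int((i + 1) * nrecs / nouts)) for i in range(nouts)]
print(bounds) #[(0, 3), (3, 6), (6, 10)] -- covers all records, sizes 3, 3, 4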
    shell:"""
{MMCMD} -t {threads} -d {output.mmi} {input.fasta}
"""
    shell:"""
{MMCMD} -t {threads} -d {output.mmi} {input.fasta}
"""
    shell:"""
    {MMCMD} -t {threads} -d {output.mmi} {input.fasta}
"""
    shell:"""
{MMCMD} -t {threads} \
    {input.mmi} {input.fasta} | \
    samtools view -F 2052 -b - | \
    samtools sort -T tmp/{wildcards.SPRPOP}_{wildcards.SMP}_{wildcards.frac}_{wildcards.ref_name} -m {resources.mem_mb}M - > {output.bam}
"""
shell:"""
    if [ $(echo {input.bams} | wc -w)  -eq 1 ]; then
        ln -s {input.bams} {output.bam}
    else
        samtools merge -@ {threads} {output.bam} {input.bams}
    fi
    samtools index -@ {threads} {output.bam}
"""
    shell:"""
{MMCMD} -t {threads} \
    {input.mmi} {input.fasta} | \
    samtools view -F 2052 -b - | \
    samtools sort -T tmp/{wildcards.SPRPOP}_{wildcards.SMP}_{wildcards.frac}_{wildcards.t2t_version} -m {resources.mem_mb}M - > {output.bam}
"""
shell:"""
    if [ $(echo {input.bams} | wc -w)  -eq 1 ]; then
        ln -s {input.bams} {output.bam}
    else
        samtools merge -@ {threads} {output.bam} {input.bams}
    fi
    samtools index -@ {threads} {output.bam}
"""
    shell:"""
{MMCMD} -t {threads} \
    {input.mmi} {input.fasta} | \
    samtools view -F 2052 -b - | \
    samtools sort -T tmp/{wildcards.SPRPOP}_{wildcards.SMP}_{wildcards.frac}_hg38 -m {resources.mem_mb}M - > {output.bam}
"""
shell:"""
    if [ $(echo {input.bams} | wc -w)  -eq 1 ]; then
        ln -s {input.bams} {output.bam}
    else
        samtools merge -@ {threads} {output.bam} {input.bams}
    fi
    samtools index -@ {threads} {output.bam}
"""
    shell:"""
rb stats {input.bam} > {output.stats}    
"""
    shell:"""
isoseq3 collapse -j {threads} --log-file {log} {input.bam} {output.gff}
sed -i 's/gene_id /gene_id=/g' {output.gff} 
sed -i 's/transcript_id /transcript_id=/g' {output.gff} #add equals signs to attributes
sed -i 's/"//g' {output.gff} #get rid of double quotes
"""
    shell:"""
blat -t=dna -q=dna -minScore=100 -maxIntron=500 -minMatch=3 {input.ref} {input.loc_seq} {output.temp_mapping_psl}
tail -n +6 {output.temp_mapping_psl} | cut -f14,16,17,10,9 | \
    awk 'BEGIN {{FS="\\t"; OFS="\\t"}} {{print $3,$4,$5,"{wildcards.loc_name}" , ".",$1}}' | bedtools sort | awk 'BEGIN {{FS="\\t"; OFS="\\t"}}{{$4=$4"_"NR; print $0}}' | \
    awk -v min_len={config[min_map_len]} '{{ if (($3 - $2) > min_len) print }}' > {output.mapping_bed}
"""
    shell:"""
minimap2 -a -k19 -w5 --splice -g 2k  -A1 -B2 -O2,32 \
    -E1,0 -C9 -z200 -ub --junc-bonus=9 --cap-sw-mem=0 --splice-flank=no -G50k \
    --secondary=yes -N 100 \
    {input.ref} {input.can_mRNA} | \
    samtools view -F 2052 -b - | \
    samtools sort -@ 4 - > {output.temp_bam}

bedtools intersect -a {output.temp_bam} -b {input.ref_loc_bed} -wa -wb -bed | \
    awk 'BEGIN{{OFS=FS}}{{$4=$16; print}}' > {output.bed12}
"""
    shell:"""
bedtools intersect -abam {input.bam} -b {input.ref_loc_bed} | samtools sort > {output.bam} 
samtools index {output.bam}
"""
    shell:"""
    rb stats {input.locus_bam} > {output.stats}
"""
    shell:'''
bedtools intersect -header -a {input.gff} -b {input.locus_bed} -wb | awk 'BEGIN{{FS="\\t"; OFS="\\t"}}{{$9=$9 " paralog="$13";" ;print $1,$2,$3,$4,$5,$6,$7,$8,$9}}' > {output.locus_gff} 
'''
script: "../scripts/get_top_paralog_isoforms.py"
script: "../scripts/get_long_supported_isoforms.py"
    shell:'''
top_isoforms=( $(cut -f 2 {input.isoform_tbl} | tail -n +2) )
for isoform in "${{top_isoforms[@]}}"; do
    if grep "${{isoform}};" {input.intron_gff} >> {output.subset_gff}; then
        echo "match_found"
    else
        echo "no match found for ${{isoform}} in {wildcards.SMP}"
    fi
done
'''
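A hedged Python equivalent of the grep loop (illustrative; the rule itself is bash): read isoform ids from column 2 of the table, then keep every GFF line mentioning "<isoform>;":

def subset_gff_by_isoforms(isoform_tbl, intron_gff, subset_gff):
    with open(isoform_tbl) as fh:
        isoforms = [line.split("\t")[1].strip() for line in fh.readlines()[1:]] #tail -n +2 | cut -f2
    gff_lines = open(intron_gff).readlines()
    with open(subset_gff, "w") as out:
        for iso in isoforms:
            hits = [l for l in gff_lines if f"{iso};" in l]
            if hits:
                out.writelines(hits)
            else:
                print(f"no match found for {iso}")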
    shell:'''
agat_sp_add_introns.pl --gff {input.gff} --out {output.temp_intron_gff}
bedtools sort -i {output.temp_intron_gff} > {output.intron_gff}
'''
script: "../scripts/plot_dot_plots.R"
    shell:"""
    bedtools getfasta -fi {input.ref} -bed {input.locus_bed} -name -fo {output.fa}
    samtools faidx {output.fa}
"""
    shell:"""
    fold -w 80 {input.ref} > {output.tmp_folded_ref}
    samtools faidx {output.tmp_folded_ref}
"""
    shell:'''
agat_sp_add_introns.pl --gff {input.gff} --out {output.temp_intron_gff}
bedtools sort -i {output.temp_intron_gff} > {output.intron_gff}
'''
    shell:"""
    agat_sp_extract_sequences.pl --gff {input.isoform_gff} --fasta {input.ref} -t intron --keep_attributes --merge --output {output.fa}
    sed -i -n '/^>/ s/.*paralog=\([^ ]*\).*transcript_id=\([^ ]*\).*/>{wildcards.SMP}__\\1__\\2/p; /^>/! p' {output.fa} #fix names
    samtools faidx {output.fa}
"""
    shell:"""
    agat_sp_extract_sequences.pl --gff {input.gff} --fasta {input.ref} -t intron --merge --keep_attributes --output {output.fa}
    sed -i 's/[^>]*>\([^ ]*\) \(.*\)/>\\1/' {output.fa} #strip extraneous header info before ORF finding
    samtools faidx {output.fa}
"""
    shell:"""
    agat_sp_extract_sequences.pl --gff {input.gff} --fasta {input.ref} -t exon --merge --keep_attributes --output {output.fa}
    sed -i 's/[^>]*>\([^ ]*\) \(.*\)/>\\1/' {output.fa} #strip extraneous header info before ORF finding
    samtools faidx {output.fa}
"""
    shell:"""
    agat_sp_extract_sequences.pl --gff {input.isoform_gff} --fasta {input.ref} -t exon --merge --keep_attributes --output {output.fa}
    sed -i 's/[^>]*>\([^ ]*\) \(.*\)/>\\1/' {output.fa} #strip extraneous header info before ORF finding
    samtools faidx {output.fa}
"""
    shell:'''
    orfipy {input.mRNA_fa} --dna $( basename "{output.orf_fa}" ) --pep $( basename "{output.aa_fa}") --outdir $( dirname "{output.orf_fa}" ) --min 100 --max 10000 --start ATG
    sed -i 's/[^>]*>\([^ ]*\) \(.*\)/>\\1/' {output.orf_fa} #strip extraneous header info for downstream steps
    sed -i 's/[^>]*>\([^ ]*\) \(.*\)/>\\1/' {output.aa_fa} #strip extraneous header info for downstream steps
    samtools faidx {output.orf_fa}
    samtools faidx {output.aa_fa}
'''
    shell:'''
    #sort by longest isoform
    sort -nr -k 2,2 {input.aa_fai} > {output.tmp_sorted_fai} 2> {log}
    #get top (process each paralog separately)
    mapfile -t paralog_array < <(cut -f 1 {output.tmp_sorted_fai} | sort | sed "s/\.[0-9]\+_ORF\.[0-9]\+//g" | sort | uniq) #get array of paralog names
    #loop over paralogs, keeping the longest ORF of each
    for cur_paralog in "${{paralog_array[@]}}"; do
        grep "${{cur_paralog}}\." {output.tmp_sorted_fai} | sort -nr -k 2,2 > {output.tmp_fai} 
        top_paralog=$( head -n 1 {output.tmp_fai} | cut -f 1 )
        printf "${{top_paralog}}\n" >> {output.isoform_list} 2> {log}
    done
'''
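The bash above strips the .<iso>_ORF.<n> suffix to recover paralog names, then keeps the longest ORF entry per paralog from the sorted .fai. A hedged Python sketch of the same selection (it anchors the suffix regex at the end of the name, which the sed does not):

import re

def top_orf_per_paralog(fai_path, out_list):
    '''pick the longest ORF per paralog from a .fai (name and length in columns 1-2)'''
    best = {}
    suffix = re.compile(r"\.[0-9]+_ORF\.[0-9]+$")
    with open(fai_path) as fh:
        for line in fh:
            fields = line.split("\t")
            name, length = fields[0], int(fields[1])
            paralog = suffix.sub("", name)
            if paralog not in best or length > best[paralog][1]:
                best[paralog] = (name, length)
    with open(out_list, "w") as out:
        for name, _ in best.values():
            out.write(name + "\n")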
    shell:'''
    sed 's/_.*//g' {input.lst} > {output.tmp_isoform_tbl}
    seqtk subseq {input.intron_fa} {output.tmp_isoform_tbl} > {output.intron_fa}
    seqtk subseq {input.mRNA_fa} {output.tmp_isoform_tbl} > {output.mRNA_fa}
''' 
    shell:'''
    seqtk subseq {input.orf_fa} {input.lst} > {output.orf_fa}
    seqtk subseq {input.aa_fa} {input.lst} > {output.aa_fa}
''' 
    shell:'''
    orfipy {input.mRNA_fa} --dna $( basename "{output.orf_fa}" ) --pep $( basename "{output.aa_fa}") --outdir $( dirname "{output.orf_fa}" ) --min 100 --max 10000 --start ATG
'''
script: "../scripts/rename_sequence_fa_reads.py"
script: "../scripts/rename_sequence_fa_reads.py"
script: "../scripts/rename_sequence_fa_reads.py"
script: "../scripts/rename_sequence_fa_reads.py"
script: "../scripts/process_orf_aa_fastas.py"
import pandas as pd


canonical_bed12_path = snakemake.input['canonical_bed12']
abundance_tbl_path = snakemake.input['abundance_tbl']
ref_gene_mappings_bed_path = snakemake.input['ref_gene_mappings_bed']
isoform_gff_path = snakemake.input['isoform_gff']

#step 1: filter abundance tbl 
MIN_ISO_COUNT = snakemake.params["min_abundance"]
abund_tbl = pd.read_csv(abundance_tbl_path, comment = "#", sep = "\t")
abund_tbl = abund_tbl.loc[abund_tbl["count_fl"] > MIN_ISO_COUNT,:]


#step 2: filter TBC1D3 gff with canonical gff
#step 2a: load bed12
bed_12_cols = ["contig", "start", "end", "aln", ".", "strand", "_start2", "_end2", "rgb", "n_exons", "exon_size", "exon_start"]
bed12 = pd.read_csv(canonical_bed12_path, sep = "\t", usecols=list(range(12)), names = bed_12_cols)
bed12 = bed12.set_index("aln", drop = False)

#load gff file
trans_df = pd.DataFrame(columns = ['id', 'start', 'end', 'strand', 'paralog', 'exon_start', 'exon_size'], )
trans_df = trans_df.set_index("id", drop = False)
with open(isoform_gff_path, 'r') as file:
    for line in file:
        # Skip comment lines starting with '#'
        if line.startswith('#'):
            continue
        columns = line.strip().split('\t')
        attributes = columns[8].strip().split(';')
        attribute_dict = {kv.split('=')[0]: kv.split('=')[1] for kv in attributes if kv.strip()} #skip the empty field left by a trailing ';'
        if columns[2] == "transcript":
            if "nbis" in attribute_dict["ID"]: #not sure why, but gff has some extra annotations with an nbis- name in ID...
                continue
            new_transcript = pd.Series({"id" : attribute_dict["transcript_id"] , "start" : int(columns[3]), "end" : int(columns[4]), 
                                        "strand" : columns[6], "paralog" : attribute_dict["paralog"],
                                        "exon_start" : [], "exon_size" : []})
            trans_df = pd.concat([trans_df, new_transcript.to_frame().T.set_index("id", drop = False)])

#add exons : note : can't put in same for loop because sometimes run into exons for transcripts that haven't been found yet
with open(isoform_gff_path, 'r') as file:
    for line in file:
        # Skip comment lines starting with '#'
        if line.startswith('#'):
            continue
        columns = line.strip().split('\t')
        attributes = columns[8].strip().split(';')
        attribute_dict = {kv.split('=')[0]: kv.split('=')[1] for kv in attributes if kv.strip()} #skip the empty field left by a trailing ';'
        if columns[2] == "exon":
            assert attribute_dict["transcript_id"] in trans_df.index, f"not finding {attribute_dict['transcript_id']} in trans_df index"
            cur_start = int(columns[3])
            cur_size = int(columns[4]) - cur_start
            cur_strand = columns[6]
            trans_df.loc[attribute_dict["transcript_id"], "exon_start"].append(cur_start)
            trans_df.loc[attribute_dict["transcript_id"], "exon_size"].append(cur_size)

#filter on length: keep isoforms spanning the canonical alignment within the flank tolerance
FLANK_TOLERANCE = snakemake.params["flank_tolerance"]
missed = 0 #paralogs in the gff with no canonical bed12 alignment
length_filter_isos = pd.DataFrame(columns = ['id', 'start', 'end', 'strand', 'paralog', 'exon_start', 'exon_size'], )
for transcript_id, row in trans_df.iterrows():
    if(row.paralog not in bed12.index):
        missed += 1
        continue
    cur_aln = bed12.loc[row.paralog]
    if(row.start <= cur_aln.start + FLANK_TOLERANCE and row.end >= cur_aln.end - FLANK_TOLERANCE):
        length_filter_isos = pd.concat([length_filter_isos, row.to_frame().T] )
#keep only isoforms that also pass the abundance-table filter (i.e. have enough full-length reads)
keep_isos = pd.DataFrame(columns = ['id', 'start', 'end', 'strand', 'paralog', 'exon_start', 'exon_size', 'abundance'], )
for transcript_id, row in length_filter_isos.iterrows():
    if(transcript_id in list(abund_tbl['pbid'])):
        row['abundance'] = int(abund_tbl.loc[abund_tbl["pbid"] == transcript_id, "count_fl"].iloc[0])
        #keep the largest transcript per paralog
        if(row.paralog in list(keep_isos['paralog']) ):
            compare_row = keep_isos.loc[ keep_isos['paralog'] == row.paralog]
            if(sum(row.exon_size) > sum(compare_row.exon_size.iloc[0]) ):
                keep_isos = keep_isos.drop([compare_row.id.iloc[0]])
                keep_isos = pd.concat([keep_isos, row.to_frame().T] )
        else:
            keep_isos = pd.concat([keep_isos, row.to_frame().T] )

#write to tbl
keep_isos.to_csv(snakemake.output["keep_isos_tbl"], sep = "\t", header=True, index = False )
#write list of kept isoform ids
with open(snakemake.output["keep_isos_lst"], "w") as file:
    for element in keep_isos["id"]:
        file.write(element + '\n')
import pandas as pd

gff_file = snakemake.input['locus_gff']

#get paralog isoforms
paralog_isoforms = {}
with open(gff_file, 'r') as file:
    # Read the locus GFF line by line
    for line in file:
        # Skip comment lines starting with '#'
        if line.startswith('#'):
            continue
        # Split the line by tab to separate the columns
        columns = line.strip().split('\t')
        # Split the 9th column by semicolon to separate the key-value pairs
        attributes = columns[8].strip().split(';')
        # Iterate through the key-value pairs
        for attribute in attributes:
            if attribute.strip() == "":
                continue
            # Split each key-value pair by '=' to get the key and value
            key, value = attribute.strip().split('=')
            # Check if the attribute key and value match your desired criteria
            if key == 'paralog':
                # Write the line to the output file
                cur_paralog = value
            elif key == 'transcript_id':
                cur_isoform = value
        if cur_paralog not in paralog_isoforms:
            paralog_isoforms[cur_paralog] = {cur_isoform}
        else:
            paralog_isoforms[cur_paralog].add(cur_isoform)

#get abundance of isoforms
abundance_file = snakemake.input.abundance_tbl
abundance_df = pd.read_csv(abundance_file, comment="#", sep = "\t")
abundance_dict = dict(zip(abundance_df['pbid'], abundance_df['count_fl']))

#get top isoforms for each paralog
out_tbl = pd.DataFrame()
for cur_paralog, isoforms in paralog_isoforms.items():
    max_value_key = None
    max_value = float('-inf')
    for key in list(isoforms):
        assert key in abundance_dict , f"Error: not finding {key} in abundance dictionary"
        if abundance_dict[key] > max_value:
            max_value_key = key
            max_value = abundance_dict[key]
    out_ser = pd.Series({"paralog" : cur_paralog , "top_isoform" : max_value_key , "n_reads" : max_value })
    out_tbl = pd.concat( [out_tbl, out_ser.to_frame().T ])

#write to output
out_tbl.to_csv(path_or_buf=snakemake.output.tbl ,sep = "\t", header=True, index = False )
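The selection above is an argmax over count_fl within each paralog's isoform set; a toy run of the same logic (made-up ids and counts):

#toy check of the per-paralog argmax the script performs
abundance = {"PB.1.1": 12, "PB.1.2": 30, "PB.2.1": 7}
paralog_isoforms = {"TBC1D3_A": {"PB.1.1", "PB.1.2"}, "TBC1D3_B": {"PB.2.1"}}
top = {p: max(isos, key=abundance.get) for p, isos in paralog_isoforms.items()}
assert top == {"TBC1D3_A": "PB.1.2", "TBC1D3_B": "PB.2.1"}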
library(tidyverse)
library(GenomicRanges)
library(ggplot2)
library(glue)
#cur_nhp = snakemake.wildcards[["SPRPOP"]]
#cur_sample = snakemake.wildcards[["SMP"]]
cur_tbl_path = snakemake@input[["tbl"]]
regions_path <- snakemake@input[["rgns"]]
cur_nhp = snakemake@params[['cur_nhp']]
cur_sample = snakemake@params[['cur_sample']]

tbl = read_tsv(cur_tbl_path)
colnames(tbl)[1] = "reference_name"
regions = read_table(regions_path, col_names = c("seqnames", "start", "stop", "name", ".", "strand"))


#turn ranges into GRanges
regions = makeGRangesFromDataFrame(regions, keep.extra.columns=TRUE)
regions$gene = regions$name #add gene column for logic I pulled from previous code

#figure out primary alignments by matches column
tbl = tbl %>% arrange( desc(matches) )
tbl = tbl %>% group_by(query_name) %>% mutate(alignment_n =  row_number() ) %>% mutate(is_primary = ifelse(alignment_n == 1, yes = TRUE, no = FALSE) ) %>% ungroup() #order alignments by matches and number them as primary, secondary, etc.
tbl_ranges = makeGRangesFromDataFrame(tbl, keep.extra.columns = T, 
                                      seqnames.field = "reference_name", 
                                      start.field = "reference_start",end.field = "reference_end", strand.field = "strand")



#get overlaps
overlaps = GenomicRanges::findOverlaps(tbl_ranges, regions)
if( length(overlaps@from) > nrow(tbl) ){
    stop(glue("reads are overlapping multiple regions. Not sure which region to annotate them as. Check table and regions for {cur_sample}.") )
}
#keep only reads that overlap regions.bed
tbl$gene = NA
tbl$gene[overlaps@from] = regions$gene[overlaps@to] #annotate each read with the gene of the region it overlaps
tbl = tbl[overlaps@from,] #drop reads with no overlapping region

#generate first plot
#get plot with all alignments
MIN_p_THRESH = snakemake@config[["min_perID"]]

#plot 1: percentage identity plot for all reads
p1 = ggplot(data = tbl %>% filter(perID_by_events >= MIN_p_THRESH), mapping = aes(x = gene, y = perID_by_events)) + 
  geom_jitter(width = 0.1, height = 0, mapping = aes(color = is_primary), alpha = 0.3) +
  ggtitle(glue("{cur_nhp} {cur_sample} isoseq read primary alignments to nhp reference paralogs.")) +
  xlab(glue("{cur_nhp} TBC1D3 paralogs identified by mapping")) +
  ylab("percent identity by events") +
  scale_color_discrete(name = "is primary alignment") +
  theme(axis.text.x = element_text(size = 8, angle = 45, vjust = 1.2))


#plot 2: primary vs secondary plot
#get stats of secondary vs. primary alignment
prim_vs_secondary = tbl %>% 
  group_by(query_name) %>% 
  arrange(desc(perID_by_events)) %>% #take each read's two best alignments, not its two worst
  filter(row_number() %in% c(1,2) ) %>%
  summarize(difference_with_secondary = abs(diff(perID_by_events)) ) %>%
  ungroup()
tbl2 = left_join(tbl, prim_vs_secondary, by = "query_name" )
tbl2 = tbl2 %>% mutate(difference_with_secondary = ifelse(is.na(difference_with_secondary), 1, difference_with_secondary)) #reads with a single alignment get the maximum difference of 1
p2 = ggplot(data = tbl2 , mapping = aes(x = gene, y = difference_with_secondary)) + 
  geom_jitter(width = 0.1, height = 0,  mapping = aes(color = is_primary)) +
  ggtitle(glue("{cur_nhp} {cur_sample} isoseq: primary vs secondary alignments %-ID")) +
  xlab(glue("{cur_nhp} TBC1D3 paralogs identified by mapping")) +
  ylab("percent identity difference")


#save plots
plots_list = list(p1, p2)
pdf( snakemake@output[['plt']] , onefile = TRUE, width = 9, height = 6 )
for(i in 1:length(plots_list) ){
  plot(plots_list[[i]])
}
dev.off()
import pandas as pd
from Bio import SeqIO
import os

isoform_paralog_df = pd.read_csv(snakemake.input['tbl'], sep = "\t")

def add_info(cur_record):
    '''add isoform, ORF, and ORF length as ad-hoc attributes on a SeqRecord'''
    #names look like '<isoform>_<ORF>'; rsplit keeps any underscores inside the isoform id intact
    cur_record.isoform, cur_record.orf = cur_record.description.strip().split(" ")[0].rsplit("_", 1)
    cur_record.length = int([x for x in cur_record.description.strip().split(" ") if "length" in x][0].split(":")[1])
    cur_record.paralog = isoform_paralog_df.loc[isoform_paralog_df['top_isoform'] == cur_record.isoform, 'paralog'].iloc[0]
    return cur_record

def process_fasta(in_fasta, out_fasta):
    '''shared steps for the ORF and AA fastas: keep the largest ORF per isoform, fix names, write out_fasta'''
    #load fasta
    records = list(SeqIO.parse(in_fasta, "fasta"))
    records = list(map(add_info, records)) #add orf, isoform, length to each record
    #keep the largest reading frame for each isoform
    largest_orf_isoforms = {}
    for cur_record in records:
        if cur_record.isoform in largest_orf_isoforms:
            if cur_record.length <= largest_orf_isoforms[cur_record.isoform]['length']:
                continue
        largest_orf_isoforms[cur_record.isoform] = {'orf' : cur_record.orf, 'length' : cur_record.length,
                                                    'paralog' : cur_record.paralog, 'sequence' : str(cur_record.seq)}
    #write to fasta
    with open(out_fasta, 'w') as new_file:
        for cur_isoform, info_dict in largest_orf_isoforms.items():
            out_name = f"{snakemake.wildcards['SMP']}__{info_dict['paralog']}__{cur_isoform}"
            out_seq = info_dict['sequence']
            new_file.write(f'>{out_name}\n')
            new_file.write(f'{out_seq}\n')

#process ORF fa
process_fasta(snakemake.input['orf_fa'], snakemake.output['orf_fa'])
# Run samtools
exit_code = os.system(f"samtools faidx {snakemake.output['orf_fa']}")
# Check if the command was successful
if exit_code == 0:
    print("orf_fa processed and indexed.")
else:
    print(f"Error. Unable to faidx {snakemake.output['orf_fa']}")

#process AA fa
process_fasta(snakemake.input['aa_fa'], snakemake.output['aa_fa'])
# Run the bash command
exit_code = os.system(f"samtools faidx {snakemake.output['aa_fa']}")
# Check if the command was successful
if exit_code == 0:
    print("aa_fa processed and indexed.")
else:
    print(f"Error. Unable to faidx {snakemake.output['aa_fa']}")
from Bio import SeqIO
import os
import sys

#step 1 : process gff
def read_gff_lines(gff_path):
    '''read in gff lines and skip comment lines'''
    gff_lines = []
    with open( gff_path , 'r') as file:
        for line in file:
            if not line.startswith('#'):
                gff_lines.append(line.rstrip('\n'))
    return gff_lines

gff_lines = read_gff_lines(snakemake.input['gff'])
if(len(gff_lines) == 0):
    print(f"{snakemake.input['gff']} is empty. Opening empty file for {snakemake.output['fa']} and fai files.")
    with open(snakemake.output['fa'], 'w'):
        pass
    with open(snakemake.output['fai'], 'w'):
        pass
    sys.exit() 

#map each transcript_id to its paralog
par_iso_dict = {}
for line in gff_lines:
    columns = line.strip().split('\t')
    col_9= columns[8].strip().split(";")
    col_9_dict = { x.split("=")[0]:x.split("=")[1] for x in col_9 }
    par_iso_dict[col_9_dict["transcript_id"]] = col_9_dict["paralog"]

#step 2 : process fasta
records = list(SeqIO.parse(snakemake.input['fa'], "fasta"))
#get new name
for record in records:
    transcript_id = record.id.split('_')[0] #ORF and AA records carry a trailing _ORF suffix that must be removed
    paralog = par_iso_dict[transcript_id]
    new_name = f"{snakemake.wildcards['SMP']}__{paralog}__{transcript_id}"
    record.id = new_name
    record.description = ""
#write file
SeqIO.write(records, snakemake.output['fa'] , "fasta")

#step 3: index new fasta
os.system(f"samtools faidx {snakemake.output['fa']}")

print("Done.")
Maintainers: public
URL: https://github.com/Xavster838/downstream_isoseq_process
Name: downstream_isoseq_process
Version: 1
Copyright: Public Domain
License: MIT License