A `snakemake` workflow for building gene trees


A Snakemake workflow for phylogenomic analysis of genome clusters.

kizuchi is a Snakemake workflow for building gene trees, starting from a collection of genomes and HMM profiles, and then analyzing the phylogenetic histories of alleles within genome-level average nucleotide identity (ANI) clusters. The aim of this workflow is to automate and document gene tree analysis in a reproducible way, with the ultimate objective of tracing recombination across timescale domains.

Rule Graph

Code Snippets

shell :
  'rm -rf aggregated_genomes proteins genes \
      hmm_hits hmmdb preani ani ani_clusters \
      scored_proteins cluster_data clusters \
      statistics benchmarks logs'
run :
  with open( log[0], 'w' ) as LOG :
    fasta_chunks = batched( input.fastas, size=len(input.fastas)//config['chunks'] )
    if not os.path.exists( 'aggregated_genomes' ) : os.mkdir( 'aggregated_genomes' )
    for n,chunk in enumerate( fasta_chunks ) :
      chunkpath = os.path.join( 'aggregated_genomes',
                                'chunk_{n}.fna'.format(n=str(n) ) ) 
      LOG.write( 'writing {chunk}...\n'.format( chunk=chunkpath ) )
      with open( chunkpath, 'w' ) as f :
        contigs = 0
        for i,genome in enumerate( chunk ) :
          for j,seq in enumerate( parse( open( genome ), 'fasta' ) ) :
            seq.id = Path(genome).stem + '_contig_' + str(j)
            seq.description = ''
            f.write( seq.format('fasta') )
          contigs = contigs + j + 1  # j is the 0-based index of the last contig written
        LOG.write( '   wrote {i} genomes with {j} contigs\n'.format( i=str(i+1),
                                                                     j=str(contigs) ) )
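
The `batched` helper is called here with a `size` keyword, which does not match `itertools.batched(iterable, n)` from Python 3.12, so it is presumably a small utility defined elsewhere in the Snakefile. A minimal sketch of what such a helper might look like, assuming it only needs to split a list of paths into fixed-size chunks (the real implementation may differ):

def batched( items, size ) :
    # hypothetical stand-in for the workflow's chunking helper
    size = max( size, 1 )  # guard against a zero chunk size from integer division
    for start in range( 0, len( items ), size ) :
        yield items[ start : start + size ]
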
wrapper :
  'file://wrappers/prodigal-gv'
run :
  profiles = []
  for hmmfile in input.hmms :
    hmm = hmmreader.read_single( open( hmmfile ) )
    if Path(hmmfile).stem != hmm.metadata.model_name :
      raise Exception( 'file name ({hmmfile}) must match model name ({modelname}).'.format(
                        hmmfile=hmmfile, modelname=hmm.metadata.model_name ) )
    profiles.append( hmm ) 
  hmmwriter.save_many_to_file( hmms=profiles, output=output.db )
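
`hmmreader` and `hmmwriter` appear to be the reader and writer modules of an HMM profile library imported at the top of the Snakefile (the same `hmmreader.read_all` call shows up again below). As an illustration only, and not part of the original rule, a sanity check along these lines could be appended to the end of the run block to confirm the concatenated database round-trips with unique model names:

  # illustrative check, not in the original rule : re-read the database just written
  # and confirm every profile kept a unique model name
  names = [ m.metadata.model_name for m in hmmreader.read_all( open( output.db ) ) ]
  assert len( names ) == len( set( names ) ), 'duplicate model names in profile database'
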
wrapper :
  'file://wrappers/hmmer/hmmsearch'
run :
  with open( log[0], 'w' ) as LOG :
    prealignments = { Path(p).stem : { 'prealign' : p,
                                       'hits'     : [] } for p in output.prealign }
    # use trusted cutoffs if provided
    trusted_cutoffs = { m.metadata.model_name : m.metadata.trusted_cutoff
                        for m in hmmreader.read_all( open( input.profile ) ) }
    LOG.write( 'parsed {n} hmms, found {m} trusted cutoffs\n'.format( 
                  n=str(len(trusted_cutoffs)),
                  m=str(sum( [ bool(v) for v in trusted_cutoffs.values() ] ) ) ) )
    # load hmmer hits and protein sequences for each chunk
    for tblout,faa in zip( input.hits, input.faa ) :
      hits = polars.DataFrame( [ h for h in parse_hmmsearch( tblout ) ] )
      seqs = { seq.id : seq for seq in parse( open( faa ), 'fasta' ) }
      chunk_genomes = { name.rsplit('_',1)[0] for name in seqs.keys() }
      LOG.write( '{chunk} : found {h} hits for {p} proteins in {g} genomes\n'.format(
                    chunk=Path(faa).stem,
                    h=str(len(hits)),
                    p=str(len(seqs)),
                    g=str(len(chunk_genomes)) ) )
      if len( hits ) == 0 : continue
      # for each hmm, for each genome, write the proteins 
      # with the top hits in order of their score value
      for genome,hmm in product( hits['tname'].unique(),
                                 hits['qname'].unique() ) :
        if trusted_cutoffs[ hmm ] :
          top_hits = hits.filter( ( polars.col('tname') == genome ) &
                                  ( polars.col('qname') == hmm ) &
                                  ( polars.col('score') >= trusted_cutoffs[hmm] ) )\
                         .sort( 'score', descending=True )
        else :
          top_hits = hits.filter( ( polars.col('tname') == genome ) &
                                  ( polars.col('qname') == hmm ) &
                                  ( polars.col('eval') <= EVALUE ) )\
                         .sort( 'score', descending=True )
        if len( top_hits ) == 0 : continue
        with open( prealignments[ hmm ]['prealign'], 'a' ) as f :
          for n,protein in enumerate( top_hits['tname'] ) :
            seq = deepcopy( seqs[ protein ] )
            if seq.seq[-1] == '*' : seq = seq[:-1]
            seq.id = seq.id + '_p' + str(n)
            prealignments[hmm]['hits'].append( seq.id )
            f.write( seq.format( 'fasta' ) )
    for k,p in prealignments.items() :
      genomes = { h.rsplit('_',4)[0] for h in p['hits'] }
      LOG.write( '{hmm} : \n'.format( hmm=k ) )
      LOG.write( '   genomes : {n}\n'.format( n=str(len(genomes)) ) )
      LOG.write( '   hits    : {n}\n'.format( n=str(len(p['hits'])) ) )
      if len( p['hits'] ) == 0 :
        LOG.write( '    NONE\n' )
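
`parse_hmmsearch` is assumed to be a helper defined elsewhere in the workflow; the rule above only relies on it yielding records with `tname`, `qname`, `score`, and `eval` fields. A rough sketch of such a parser for hmmsearch `--tblout` output (whitespace-delimited columns, comment lines starting with `#`, target name in column 1, query name in column 3, full-sequence E-value in column 5, full-sequence bit score in column 6) might look like this; the actual helper may parse more fields or handle edge cases differently:

def parse_hmmsearch( tblout ) :
    # hypothetical stand-in for the workflow's tblout parser : yield one dict per hit
    with open( tblout ) as handle :
        for line in handle :
            if line.startswith( '#' ) :
                continue
            fields = line.split()
            yield { 'tname' : fields[0],           # target (protein) name
                    'qname' : fields[2],           # query (HMM) name
                    'eval'  : float( fields[4] ),  # full-sequence E-value
                    'score' : float( fields[5] ) } # full-sequence bit score
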
run :
  with open( output.references, 'w' ) as rf :
    for fasta in input.references :
      rf.write( fasta + '\n' )
  with open( output.queries, 'w' ) as qf :
    for fasta in input.queries :
      qf.write( fasta + '\n' )
wrapper :
  'file://wrappers/fastani'
run :
  header = [ 'query', 'reference', 'ANI', 
             'bidirectional fragment mappings',
             'total query fragments' ]

  fastani = pandas.read_csv( input.ani, sep='\t', names=header )

  # file names -> genome names
  fastani['query']     = [ Path(p).stem for p in fastani['query'] ]
  fastani['reference'] = [ Path(p).stem for p in fastani['reference'] ]

  # drop self-hits
  fastani = fastani[ fastani['query'] != fastani['reference'] ]

  # de-duplicate (i.e., take the lower triangle of the matrix)
  fastani['hash'] = fastani.apply( lambda row : hash( 
                                                  tuple(
                                                    sorted( ( row['query'],
                                                              row['reference'] ) ) ) ), 
                                  axis=1 )

  fastani.drop_duplicates( subset='hash', keep='first', inplace=True )
  fastani.drop( ['hash'], axis=1, inplace=True )

  # convert % identity to a distance measure for clustering
  threshold = 1.0 - float( Path( output[0] ).stem.rsplit( '_', 1 )[1] )/100

  with open( output[0], 'w' ) as f :
    genomes = list( set( fastani['query'] ) | set( fastani['reference'] ) )

    D = zeros( ( len(genomes), len(genomes) ) )
    D.fill( 1.0 )

    for i in range(len(genomes)) :
      D[i,i] = 0.0

    for n,row in fastani.iterrows() :
      i = genomes.index( row['query'] )
      j = genomes.index( row['reference'] )
      D[i,j] = D[j,i] = 1.0 - row['ANI']/100

    ani_clusters = defaultdict(list)
    for n,c in enumerate( fcluster( single( squareform(D) ),
                                    threshold,
                                    criterion='distance' ) ) :
      ani_clusters[c].append( genomes[n] )

    for cid, cg in ani_clusters.items() :
      f.write( '{cid}\t{genomes}\n'.format( cid=str(cid), genomes=','.join(cg) ) )
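
The clustering above chains three SciPy calls: `squareform` condenses the symmetric distance matrix, `single` builds a single-linkage dendrogram from it, and `fcluster` cuts that dendrogram at the distance threshold derived from the ANI cutoff. A self-contained toy example of the same composition, using made-up distances rather than fastANI output:

from collections import defaultdict
from numpy import array
from scipy.cluster.hierarchy import fcluster, single
from scipy.spatial.distance import squareform

genomes = [ 'A', 'B', 'C', 'D' ]

# toy symmetric distance matrix (1 - ANI/100) : A, B and C are mutually close,
# D is distant from everything
D = array( [ [ 0.00, 0.01, 0.04, 0.30 ],
             [ 0.01, 0.00, 0.05, 0.31 ],
             [ 0.04, 0.05, 0.00, 0.28 ],
             [ 0.30, 0.31, 0.28, 0.00 ] ] )

threshold = 0.05  # equivalent to clustering at 95% ANI

ani_clusters = defaultdict( list )
for genome, cid in zip( genomes, fcluster( single( squareform( D ) ),
                                           threshold,
                                           criterion='distance' ) ) :
    ani_clusters[ int(cid) ].append( genome )

print( dict( ani_clusters ) )  # e.g. {1: ['A', 'B', 'C'], 2: ['D']}
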
run :
  def intersect( hmms, genomes, threshold ) :
    C = defaultdict( set )
    for hmm,genome in zip( hmms, genomes ) :
      C[hmm].add(genome)
    return sum( [ len( C[a] & C[b] ) > threshold
                  for a,b in combinations( C.keys(), 2 ) ] )

  with open( log[0], 'w' ) as LOG :
    LOG.write( 'loading sequences...\n' )

    sequences = { Path(fasta).stem : { rec.id : rec.seq
                                       for rec
                                       in parse( open( fasta ), 'fasta' ) } 
                  for fasta
                  in input.prealignments }

    LOG.write( 'Genes loaded   : {n}\n'.format( n=str(len(sequences)) ) )

    geneseqs = defaultdict(dict)
    for chunk in input.genes :
      for seq in parse( open( chunk ), 'fasta' ) :
        genome = seq.id.rsplit('_',3)[0] 
        geneseqs[ genome ][ seq.id ] = seq

    LOG.write( 'Genomes loaded : {n}\n'.format( n=str(len(geneseqs)) ) )

    LOG.write( 'creating gene table for genome clusters...\n' )

    records = []
    for ani,clusterfile in zip( params.ANIs, input.ani_clusters ) :
      for cid,names in [ line.split() for line in open( clusterfile ) ] :
        names = names.split(',')
        #if not len( names ) > 5 : continue
        for hmm in sequences.keys() :
          seq_ids = [ seq_id for seq_id in sequences[hmm].keys() if seq_id.rsplit('_',4)[0] in names ]
          for seq_id in seq_ids :
            records.append( { 'seq_id' : seq_id,
                              'genome' : seq_id.rsplit('_',4)[0],
                              'cid'    : int( cid ),
                              'hmm'    : hmm,
                              'ANI'    : float( ani ),
                              'id'     : cid + '_' + Path( clusterfile ).stem.split('_')[-1] } )

    df = polars.DataFrame( records )

    LOG.write( 'number of clusters by ANI...\n' )
    for ani,n in df.groupby('ANI').agg( polars.col('cid').unique().count() ).sort('ANI').iter_rows() :
      LOG.write( '    ANI={ani} : {n} clusters\n'.format( ani=ani, n=n ) )

    LOG.write( 'gene table description :\n\n' + str( df.describe() ) + '\n' )

    clusters = df.groupby('id').agg( ['hmm','genome'] ).apply(
                       lambda x : ( x[0],
                         intersect( x[1],
                                    x[2],
                                    MIN_CLUSTER_LINKS ) ) ).filter( polars.col('column_1') > 0 )['column_0']

    LOG.write( 'found {n} genome clusters with at least {T} genomes sharing at least two genes.\n'.format(
               n=len(clusters), T=MIN_CLUSTER_LINKS ) )

    cluster_table = df.filter( polars.col('id').is_in( clusters ) )

    LOG.write( 'filtered gene table description :\n\n' + str( cluster_table.describe() ) )

    cluster_table.write_csv( file=output.cluster_table, separator='\t' )

    # create cluster directory structure and write gene nucleotide
    # sequences as individual fastas per cluster

    LOG.write( 'found {n} clusters.\n'.format( n=len( cluster_table.groupby('id').agg('seq_id') ) ) )

    if not os.path.exists( 'clusters' ) : os.mkdir( 'clusters' )

    for ANI,cids in cluster_table.groupby('ANI').agg( polars.col('cid').unique() ).sort('ANI').iter_rows() :

      anipath = os.path.join( 'clusters', str(ANI) )
      if not os.path.exists( anipath ) : os.mkdir( anipath )

      for cluster_id in cids :

        clusterpath = os.path.join( anipath, str( cluster_id ) )
        if not os.path.exists( clusterpath ) : os.mkdir( clusterpath )
        genepath = os.path.join( clusterpath, 'genes' )
        if not os.path.exists( genepath ) : os.mkdir( genepath )

        for hmm,seq_ids,genomes in cluster_table\
                                   .filter( ( polars.col('ANI') == ANI ) &
                                            ( polars.col('cid') == cluster_id ) )\
                                   .groupby('hmm')\
                                   .agg( ['seq_id', 'genome'] )\
                                   .iter_rows() :
          with open( os.path.join( genepath, hmm + '.fna' ), 'w' ) as f :
            for seq_id,genome in zip( seq_ids, genomes ) :
              orig_seq_id = seq_id.rsplit('_',1)[0] # remove hmmer evalue rank
              seq = geneseqs[genome][orig_seq_id] 
              seq.id = seq_id
              f.write( seq.format( 'fasta' ) )
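
For each genome cluster that passes the gene-sharing filter, the loop above therefore writes one nucleotide FASTA per HMM profile into a directory tree of the form clusters/<ANI>/<cluster id>/genes/<HMM name>.fna, grouping the member genomes' gene sequences by profile.
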
Seven more snippets from the workflow are not shown here.

URL: https://github.com/ryneches/kizuchi
Name: kizuchi
Version: 1
License: BSD 3-Clause "New" or "Revised" License