Code Snippets

Analysis code for the TAP-seq manuscript.

# extract cis enhancer - gene pairs, remove significant pairs that increase gene expression
cis_enh_perts <- results %>%
  filter(enh_type == "cis", abs(dist_to_tss) >= 1000) %>%
  filter(!(significant == "sig" & manual_lfc > 0))

# only retain hits within the same target region (discard out of region control enhancers) and add
# "chr" to enhancer chromosome for use with ENCODE data
cis_enh_perts <- cis_enh_perts %>%
  filter(enh_chr == sub("Chr", "", sample)) %>%
  mutate(enh_chr = paste0("chr", enh_chr),
         gene_chr = paste0("chr", gene_chr))

# get enhancer coordinates and convert to GRanges object
enh_coords <- cis_enh_perts %>%
  select(enh_chr, enh_start, enh_end, perturbation) %>%
  distinct() %>%
  makeGRangesFromDataFrame(keep.extra.columns = TRUE) %>%
  `names<-`(.$perturbation)  # set names to perturbation id

# load encode chip-seq data
chip_data <- lapply(chip_files, FUN = import, which = range(enh_coords), format = "bigWig")

R Markdown bigWig From line 59 of analyses/chromatin_analyses_screen.Rmd

ChIP-seq analysis pipeline used in Bragdon et. al. 2022. (v1)

import argparse
import numpy as np
import pyBigWig as pybw

def main(chip_in="non-depleted-Rpb1-IP-3_Rpb1-chipseq-spikenorm-midpoints_smoothed.bw",
         input_in="non-depleted-untagged-input-3_Rpb1-chipseq-spikenorm-midpoints_smoothed.bw",
         ratio_out="ratio.bw"):
    chip = pybw.open(chip_in)
    input = pybw.open(input_in)
    ratio = pybw.open(ratio_out, "w")

    assert chip.chroms() == input.chroms(), "ChIP and input bigWig chromosomes don't match."

    ratio.addHeader(list(chip.chroms().items()))

    for chrom in chip.chroms():
        chip_values = chip.values(chrom, 0, chip.chroms(chrom), numpy=True)
        input_values = input.values(chrom, 0, chip.chroms(chrom), numpy=True)
        ratio.addEntries(chrom, 0, values=np.log2(np.divide(chip_values, input_values)), span=1, step=1)

    chip.close()
    input.close()
    ratio.close()

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Given two bigWig coverage files, generate a coverage file of their log2 ratio.')
    parser.add_argument('-c', dest='chip_in', type=str, help='Path to numerator (ChIP) bigWig.')
    parser.add_argument('-i', dest='input_in', type=str, help='Path to denominator (input) bigWig.')
    parser.add_argument('-o', dest='ratio_out', type=str, help='Path to output bigWig.')
    args = parser.parse_args()
    main(chip_in=args.chip_in,
         input_in=args.input_in,
         ratio_out=args.ratio_out)

Python numpy bigWig From line 3 of scripts/make_ratio_bigwig.py

muChip Analysis Workflow: Trackhub Creation and Sample Visualization

from pathlib import PurePath
import colorsys

import trackhub

def get_color(i, n_colors):
    h =  i/float(n_colors)
    s = 0.5 # color_intensity[filetype]
    color = [255*v for v in colorsys.hsv_to_rgb(h, 0.5, 0.5)]
    return ",".join([str(int(c)) for c in list(color)])

def histone_track(name, color="200,0,0"):
    composite = trackhub.CompositeTrack(
        name=name+"_composite",
        short_label=name,
        tracktype='bigWig',
        visibility='full',
        color=color
    )
    signal_view = trackhub.ViewTrack(
        name=name+"_signal",
        view='signal',
        visibility='full',
        tracktype='bigWig',
        short_label=name+'_signal',
        autoScale="on"
    )
    regions_view = trackhub.ViewTrack(
        name=name+'_region',
        view='regions',
        visibility='dense',
        tracktype='bigWig',
        short_label=name+'_regions')

    composite.add_view(signal_view)
    composite.add_view(regions_view)

    for signal_type in ["treat_pileup", "control_lambda"]:
        track = trackhub.Track(
            tracktype='bigWig',
            name=name+"_"+signal_type,
            url="%s_%s.bw" % (name, signal_type),
            short_label=signal_type,
            autoScale="on"
        )
        signal_view.add_tracks(track)

    for region_type in ["domains"]:
        track = trackhub.Track(
            name=name+"_"+region_type,
            url="%s_%s.bb" %(name, region_type),
            short_label=region_type,
            tracktype='bigBed')
        regions_view.add_tracks(track)
    return composite

names = [PurePath(name).stem.replace("_domains", "") for name in snakemake.input.domains]
colors = [get_color(i, len(names)) for i in range(len(names))]
hub, genomes_file, genome, trackdb = trackhub.default_hub(
    hub_name="testing",
    genome="hg38",
    email="[email protected]")
trackdb.add_tracks([histone_track(*pair) for pair in zip(names, colors)])
open(snakemake.output[0], "w").write(str(trackdb))

Python bigWig trackhub From line 1 of scripts/trackhub.py

Reference alignment workflow (v0.1)

hub = """
hub gene-conversion
shortLabel gene-conversion
longLabel gene-conversion
genomesFile genomes.txt
email mvollger.edu
"""

genomes = """
genome {ref}
trackDb trackDb.txt
"""

track_db_header = """
track gene-conversion
compositeTrack off
shortLabel gene-conversion
longLabel gene-conversion
visibility hide
type bigBed 9 +
itemRgb on
maxItems 100000
filter.score 5:1000
filterByRange.score on
filterLimits.score 0:1000
filterLabel.score Minimum decrease in mismatches
"""

track = """
    track g-c-{sm}
    parent gene-conversion
    bigDataUrl gene-conversion/{sm}.bb
    shortLabel {sm} gc D/A
    longLabel {sm} gene conversion
    type bigBed 9 +
    itemRgb on
    priority {pri}
    visibility dense
"""


track_db_interact_header = """
track interact-gene-conversion
compositeTrack off
shortLabel interact-gc
longLabel gene conversion interactions
visibility hide
type bigInteract
maxItems 100000
filter.score 5:1000
filterByRange.score on
filterLimits.score 0:1000
"""

track_interact = """
    track interact-g-c-{sm}
    parent gene-conversion
    bigDataUrl gene-conversion/{sm}.interact.bb
    shortLabel {sm} gc
    longLabel {sm} gene conversion interactions
    type bigInteract
    maxHeightPixels 100:30:5
    priority {pri2}
    visibility full
"""

track_super = """
track gene-conversion-by-sample
superTrack on show
shortLabel gc-by-sample
longLabel gene conversion by sample
filter.score 5:1000
filterByRange.score on
filterLimits.score 0:1000

"""

track_comp = """
    track {sm}
    parent gene-conversion-by-sample
    compositeTrack on
    shortLabel {sm}-gc
    longLabel {sm} gene conversion
    type bigWig
    visibility full

        track gc-{sm}
        parent {sm}
        bigDataUrl gene-conversion/{sm}.bb
        shortLabel {sm} gc
        longLabel {sm} gene conversion
        type bigBed 9 +
        itemRgb on
        visibility dense

        track interact-{sm}
        parent {sm}
        bigDataUrl gene-conversion/{sm}.interact.bb
        shortLabel {sm} interact
        longLabel {sm} interactions
        type bigInteract
        maxHeightPixels 100:30:5
        visibility full

"""


all_tracks = """
track g-c-interact
bigDataUrl all_candidate_interactions.bb
shortLabel all gc interact
longLabel all gene conversion interactions
type bigInteract
visibility hide

track Donor 
bigDataUrl all_candidate_windows_donor.bw
shortLabel Donor 
longLabel Donor
type bigWig
color 211,144,0
autoScale on
visibility full

track Acceptor 
bigDataUrl all_candidate_windows_acceptor.bw
shortLabel Acceptor 
longLabel Acceptor
type bigWig
color 0,127,211
autoScale on
visibility full

track gene-conversion-windows
bigDataUrl all_candidate_windows.bb
shortLabel all g-c windows
longLabel all gene conversion windows
type bigBed 9 +
itemRgb on
visibility dense
maxItems 100000

"""


view_format_super = """
# SuperTrack declaration no type or visibility is required
# However "show" is needed to have a superTrack on by  default
track gene-conversion
longLabel gene conversion for HPRC samples
superTrack on show
shortLabel gene-conversion

"""
view_format_comp = """
    # Composite declaration, usually composite tracks are all of one type,
    #   and the type can be declared.
    # When a mixed type (some bigBeds, some bigWigs) you need to use the unusual
    #   'type bed 3' declaration.
    # The subGroup1 line will define groups,
    #   in this case the special 'view' group
    #   (a new subGroup2 could be metadata)
    # Later individual tracks will identify what 'subGroups id' they belong to.
    track gene-conversion-by-sample
    type bed 3
    longLabel gene conversion by sample
    parent gene-conversion
    compositeTrack on
    shortLabel gc-by-sample
    visibility full
    subGroup1 view Views bb=Colored_bigBed_items int=Interact_Data bg=BigBedGraph_items

"""
view_fromat_bb = """
        # This is the unexpected part about views,
        #    you need a separate parent to group the view
        # So this new view-specific stanza with "view id"
        #    can collect all tracks with some visibility settings
        track gene-conversion-by-sample-bb
        parent gene-conversion-by-sample on
        view bb
        visibility pack
        itemRgb on 
        maxItems 100000
        filter.score 5:1000
        filterByRange.score on
        filterLimits.score 0:1000

"""
view_format_bb_sm = """
            # Child bigBed in this view
            # The 'subGroups view=bb' shares this track belongs in a view,
            #    even though a parent declaration is also needed
            # All these tracks should be the same type of data
            track gene-conversion-by-sample-bb-{sm}
            parent gene-conversion-by-sample-bb
            type bigBed 9 +
            longLabel {sm} gene conversion bb 
            bigDataUrl gene-conversion/{sm}.bb
            shortLabel {sm}-gc-bb
            subGroups view=bb

"""
view_format_int = """
        # New View Stanza that collects all interact in this composite
        # This declares related bigInteract tracks    
        track gene-conversion-by-sample-interact
        parent gene-conversion-by-sample on
        view int
        visibility full
        maxHeightPixels 100:55:5
        maxItems 100000
        filter.score 5:1000
        filterByRange.score on
        filterLimits.score 0:1000

"""
view_format_int_sm = """
            # Child one Interact
            track gene-conversion-by-sample-interact-{sm}
            parent gene-conversion-by-sample-interact
            type bigInteract
            longLabel {sm} gene conversion interactions
            bigDataUrl gene-conversion/{sm}.interact.bb
            shortLabel {sm}-gc-interact
            subGroups view=int

"""

view_format_bg = """
        track gene-conversion-by-sample-bg
        parent gene-conversion-by-sample on
        view bg
        visibility full
        maxHeightPixels 100:10:5
        maxItems 100000
"""

view_format_bg_sm = """
            track gene-conversion-by-sample-bg-{sm}
            parent gene-conversion-by-sample-bg
            longLabel {sm} gene conversion bg
            bigDataUrl gene-conversion/{sm}.bg
            shortLabel {sm}-gc-bg
            subGroups view=bg
            autoScale Off
            graphTypeDefault Bar
            gridDefault OFF
            windowingFunction Mean
            color 175,4,4
            altColor 47,79,79
            viewLimits 0:5
            type bigWig 0 1000
"""

with open(snakemake.output.track, "w") as out:
    out.write(all_tracks)
    if False:
        out.write(track_db_header)
        # out.write(track_db_interact_header)
        [
            out.write(
                (track + track_interact).format(
                    sm=sm, pri=2 * idx + 1, pri2=2 * idx + 2
                )
            )  # pri=idx + 1, pri2=idx + 2))
            for idx, sm in enumerate(snakemake.params.samples)
        ]
    elif True:
        out.write(view_format_super)
        out.write(view_format_comp)
        # big beds
        out.write(view_fromat_bb)
        [out.write(view_format_bb_sm.format(sm=sm)) for sm in snakemake.params.samples]
        # bigInteract
        out.write(view_format_int)
        [out.write(view_format_int_sm.format(sm=sm)) for sm in snakemake.params.samples]
        # bedGraph
        out.write(view_format_bg)
        [out.write(view_format_bg_sm.format(sm=sm)) for sm in snakemake.params.samples]
    else:
        out.write(track_super)
        [out.write(track_comp.format(sm=sm)) for sm in snakemake.params.samples]

open(snakemake.output.hub, "w").write(hub)

ref = snakemake.wildcards.ref
if "CHM13_V1.1" in ref:
    print("changing ref")
    ref = "t2t-chm13-v1.1"
open(snakemake.output.genomes, "w").write(genomes.format(ref=ref))

Python bigWig From line 1 of scripts/track_hub.py

Snakemake workflow: biseps

import sys, math, multiprocessing, subprocess, os

# Usage: python3 bismark_to_bigwig_pe.py [-keep] [-sort] [-all] [-L=labels] [-p=num_proc] <chrm_sizes>  <allC_file> [allC_file]*

# NOTE: allc file contains the methylation information for all chromosomes

# Steps:
# 1. allC to bedGraph
# 2. sort bedGraph if necessary
# 3. bedGraph to BigWig
# 4. remove temporary files

NUMPROC=1

def processInputs( allCFileAr, chrmFileStr, keepTmp, labelsAr, outID, numProc, isSort, useAll ):
	print( 'Keep temp files: {:s}; Sort bedGraph: {:s}; Use all positions: {:s}'.format( str( keepTmp), str (isSort), str(useAll)))
	print( 'Begin processing files with {:d} processors'.format( numProc ) )
	pool = multiprocessing.Pool( processes=numProc )
	results = [ pool.apply_async( processFile, args=(allCFileAr[i], chrmFileStr, labelsAr[i], outID, keepTmp, isSort, useAll) ) for i in range(len(allCFileAr)) ]
	suc = [ p.get() for p in results ]

	print( 'Done' )


def processFile( allCFileStr, chrmFileStr, label, outID, keepTmp, isSort, useAll ):
	if outID == None and label == None:
		outID = allCFileStr.replace( '.tsv','' ).replace( 'allc_','' )
	elif outID == None:
		outID = label
	elif label == None:
		outID += '_' + allCFileStr.replace( '.tsv','' ).replace( 'allc_','' )
	else:
		outID += '_' + label

	print( 'Reading allC file {:s}'.format( allCFileStr ) )
	# allC to bedGraphs
	bedGraphStr =  outID + '.bedGraph'
	bedGraphAr = [bedGraphStr + '.' + x for x in ['cg','chg','chh'] ]
	readAllC( allCFileStr, bedGraphAr, useAll )

	if isSort:
		print( 'Sorting bedGraph files' )
		for b in bedGraphAr:
			sortBedFile( b )

	print( 'Converting {:s} files to BigWig'.format(bedGraphStr ) )
	# bedGraph to bigWig
	for b in bedGraphAr:
		processBedGraph( b, chrmFileStr )

	# remove temporary
	if keepTmp == False:
		print( 'Removing temporary files...' )
		for b in bedGraphAr:
			os.remove( b )
	print( 'BigWig finished for {:s}.bw.*'.format( outID ) )

def readAllC( allCFileStr, outFileAr, useAll ):

	allCFile = open( allCFileStr, 'r' )
	outFileAr = [open( outFileStr, 'w' ) for outFileStr in outFileAr]

	mTypes = [ 'CG', 'CHG', 'CHH' ]

	for line in allCFile:
		lineAr = line.rstrip().split('\t')
		# (0) chr (1) pos (2) strand (3) mC (4) C (5) Cctxt
		# (6) trintctxt
		if len(lineAr) < 7:
			continue
		elif ( useAll or int(lineAr[3])> 0 ):
			chrm = lineAr[0]
			pos = int( lineAr[1] ) - 1
			methType = decodeMethType( lineAr[5] )
			try:
				mInd = mTypes.index( methType )
			except ValueError:
				continue
			value = float( lineAr[3] ) / (float( lineAr[4] ) + float( lineAr[3]))
			# adjust for negative strand
			if lineAr[2] == '-':
				value = value * -1

			# (0) chrm (1) start (2) end (3) value
			outStr = '{:s}\t{:d}\t{:d}\t{:f}\n'.format( chrm, pos, pos+1, value )
			outFile = outFileAr[mInd]
			outFile.write( outStr )
		# end if
	# end for
	allCFile.close()
	[ outFile.close() for outFile in outFileAr ]

def decodeMethType( mStr ):

	if mStr.startswith( 'CG' ):
		return 'CG'
	elif mStr.endswith( 'G' ):
		return 'CHG'
	elif mStr == 'CNN':
		return 'CNN'
	else:
		return 'CHH'

def sortBedFile( bedFileStr ):
	command = 'bedSort {:s} {:s}'.format( bedFileStr, bedFileStr )
	subprocess.call( command, shell=True )

def processBedGraph( bedGraphStr, chrmFileStr ):

	bigWigStr = bedGraphStr.replace( '.bedGraph', '.bw' )
	#print( bigWigStr )
	# bedGraphToBigWig in.bedGraph chrom.sizes out.bw
	command = 'bedGraphToBigWig {:s} {:s} {:s}'.format( bedGraphStr, chrmFileStr, bigWigStr )
	subprocess.call( command, shell=True)


def parseInputs( argv ):
	# Usage: python3 bismark_to_bigwig_pe.py [-keep] [-sort] [-all] [-L=labels] [-p=num_proc] <chrm_sizes>  <allC_file> [allC_file]*

	keepTmp = False
	labelsAr = []
	numProc = NUMPROC
	isSort = False
	useAll = False
	outID = None
	startInd = 0

	for i in range( min(5, len(argv)-2) ):
		if argv[i] == '-keep':
			keepTmp = True
			startInd += 1
		elif argv[i] == '-sort':
			isSort = True
			startInd += 1
		elif argv[i] == '-all':
			useAll = True
			startInd += 1
		elif argv[i].startswith( '-L=' ):
			labelsAr = argv[i][3:].split( ',' )
			startInd += 1
		elif argv[i].startswith( '-o=' ):
			outID = argv[i][3:]
			startInd += 1
		elif argv[i].startswith( '-p=' ):
			try:
				numProc = int( argv[i][3:] )
				startInd += 1
			except ValueError:
				print( 'ERROR: number of processors must be an integer' )
				exit()
		elif argv[i] in [ '-h', '--help', '-help']:
			printHelp()
			exit()
		elif argv[i].startswith( '-' ):
			print( 'ERROR: {:s} is not a valid parameter'.format( argv[i] ) )
			exit()

	chrmFileStr = argv[startInd]
	allCFileAr = []
	for j in range(startInd+1, len(argv) ):
		allCFileAr += [ argv[j] ]

	if len(labelsAr) == 0:
		labelsAr = [None] * len(allCFileAr)
	elif len(labelsAr) != len(allCFileAr):
		print( "ERROR: number of labels doesn't match number of allC files" )
		exit()

	processInputs( allCFileAr, chrmFileStr, keepTmp, labelsAr, outID, numProc, isSort, useAll )

def printHelp():
	print ("Usage: python3 bismark_to_bigwig_pe.py [-keep] [-sort] [-all] [-L=labels] [-o=out_id] [-p=num_proc] <chrm_sizes>  <bismark_file> [bismark_file]*")
	print( 'Converts bismark files to context-specific BigWig files' )
	print( 'Note: bedGraphToBigWig and bedSort programs must be in the path' )
	print( 'Required:' )
	print( 'chrm_file\ttab-delimited file with chromosome names and lengths,\n\t\ti.e. fasta index file' )
	print( 'bismark_file\tbismark file with all chrms and contexts' )
	print( 'Optional:' )
	print( '-keep\t\tkeep intermediate files' )
	print( '-sort\t\tcalls bedSort; add this option if bigwig conversion fails' )
	print( '-all\t\tuse all covered positions including 0s [default only includes mC > 1]' )
	print( '-L=labels\tcomma-separated list of labels to use for the allC files;\n\t\tdefaults to using information from the allc file name' )
	print( '-o=out_id\toptional identifier to be added to the output file names' )
	print( '-p=num_proc\tnumber of processors to use [default 1]' )

if __name__ == "__main__":
	if len(sys.argv) < 3 :
		printHelp()
	else:
		parseInputs( sys.argv[1:] )

Python bigWig bedGraphToBigWig Bismark From line 1 of scripts/bismark_to_bigwig_pe.py

Workflow Steps and Code Snippets

Analysis code for the TAP-seq manuscript.

ChIP-seq analysis pipeline used in Bragdon et. al. 2022. (v1)

muChip Analysis Workflow: Trackhub Creation and Sample Visualization

Reference alignment workflow (v0.1)

Snakemake workflow: biseps

bigWig