Workflow Steps and Code Snippets
5 tagged steps and code snippets that match keyword bigWig
Analysis code for the TAP-seq manuscript.
59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 | # extract cis enhancer - gene pairs, remove significant pairs that increase gene expression cis_enh_perts <- results %>% filter(enh_type == "cis", abs(dist_to_tss) >= 1000) %>% filter(!(significant == "sig" & manual_lfc > 0)) # only retain hits within the same target region (discard out of region control enhancers) and add # "chr" to enhancer chromosome for use with ENCODE data cis_enh_perts <- cis_enh_perts %>% filter(enh_chr == sub("Chr", "", sample)) %>% mutate(enh_chr = paste0("chr", enh_chr), gene_chr = paste0("chr", gene_chr)) # get enhancer coordinates and convert to GRanges object enh_coords <- cis_enh_perts %>% select(enh_chr, enh_start, enh_end, perturbation) %>% distinct() %>% makeGRangesFromDataFrame(keep.extra.columns = TRUE) %>% `names<-`(.$perturbation) # set names to perturbation id # load encode chip-seq data chip_data <- lapply(chip_files, FUN = import, which = range(enh_coords), format = "bigWig") |
ChIP-seq analysis pipeline used in Bragdon et. al. 2022. (v1)
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 | import argparse import numpy as np import pyBigWig as pybw def main(chip_in="non-depleted-Rpb1-IP-3_Rpb1-chipseq-spikenorm-midpoints_smoothed.bw", input_in="non-depleted-untagged-input-3_Rpb1-chipseq-spikenorm-midpoints_smoothed.bw", ratio_out="ratio.bw"): chip = pybw.open(chip_in) input = pybw.open(input_in) ratio = pybw.open(ratio_out, "w") assert chip.chroms() == input.chroms(), "ChIP and input bigWig chromosomes don't match." ratio.addHeader(list(chip.chroms().items())) for chrom in chip.chroms(): chip_values = chip.values(chrom, 0, chip.chroms(chrom), numpy=True) input_values = input.values(chrom, 0, chip.chroms(chrom), numpy=True) ratio.addEntries(chrom, 0, values=np.log2(np.divide(chip_values, input_values)), span=1, step=1) chip.close() input.close() ratio.close() if __name__ == "__main__": parser = argparse.ArgumentParser(description='Given two bigWig coverage files, generate a coverage file of their log2 ratio.') parser.add_argument('-c', dest='chip_in', type=str, help='Path to numerator (ChIP) bigWig.') parser.add_argument('-i', dest='input_in', type=str, help='Path to denominator (input) bigWig.') parser.add_argument('-o', dest='ratio_out', type=str, help='Path to output bigWig.') args = parser.parse_args() main(chip_in=args.chip_in, input_in=args.input_in, ratio_out=args.ratio_out) |
muChip Analysis Workflow: Trackhub Creation and Sample Visualization
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 | from pathlib import PurePath import colorsys import trackhub def get_color(i, n_colors): h = i/float(n_colors) s = 0.5 # color_intensity[filetype] color = [255*v for v in colorsys.hsv_to_rgb(h, 0.5, 0.5)] return ",".join([str(int(c)) for c in list(color)]) def histone_track(name, color="200,0,0"): composite = trackhub.CompositeTrack( name=name+"_composite", short_label=name, tracktype='bigWig', visibility='full', color=color ) signal_view = trackhub.ViewTrack( name=name+"_signal", view='signal', visibility='full', tracktype='bigWig', short_label=name+'_signal', autoScale="on" ) regions_view = trackhub.ViewTrack( name=name+'_region', view='regions', visibility='dense', tracktype='bigWig', short_label=name+'_regions') composite.add_view(signal_view) composite.add_view(regions_view) for signal_type in ["treat_pileup", "control_lambda"]: track = trackhub.Track( tracktype='bigWig', name=name+"_"+signal_type, url="%s_%s.bw" % (name, signal_type), short_label=signal_type, autoScale="on" ) signal_view.add_tracks(track) for region_type in ["domains"]: track = trackhub.Track( name=name+"_"+region_type, url="%s_%s.bb" %(name, region_type), short_label=region_type, tracktype='bigBed') regions_view.add_tracks(track) return composite names = [PurePath(name).stem.replace("_domains", "") for name in snakemake.input.domains] colors = [get_color(i, len(names)) for i in range(len(names))] hub, genomes_file, genome, trackdb = trackhub.default_hub( hub_name="testing", genome="hg38", email="[email protected]") trackdb.add_tracks([histone_track(*pair) for pair in zip(names, colors)]) open(snakemake.output[0], "w").write(str(trackdb)) |
Reference alignment workflow (v0.1)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 | hub = """ hub gene-conversion shortLabel gene-conversion longLabel gene-conversion genomesFile genomes.txt email mvollger.edu """ genomes = """ genome {ref} trackDb trackDb.txt """ track_db_header = """ track gene-conversion compositeTrack off shortLabel gene-conversion longLabel gene-conversion visibility hide type bigBed 9 + itemRgb on maxItems 100000 filter.score 5:1000 filterByRange.score on filterLimits.score 0:1000 filterLabel.score Minimum decrease in mismatches """ track = """ track g-c-{sm} parent gene-conversion bigDataUrl gene-conversion/{sm}.bb shortLabel {sm} gc D/A longLabel {sm} gene conversion type bigBed 9 + itemRgb on priority {pri} visibility dense """ track_db_interact_header = """ track interact-gene-conversion compositeTrack off shortLabel interact-gc longLabel gene conversion interactions visibility hide type bigInteract maxItems 100000 filter.score 5:1000 filterByRange.score on filterLimits.score 0:1000 """ track_interact = """ track interact-g-c-{sm} parent gene-conversion bigDataUrl gene-conversion/{sm}.interact.bb shortLabel {sm} gc longLabel {sm} gene conversion interactions type bigInteract maxHeightPixels 100:30:5 priority {pri2} visibility full """ track_super = """ track gene-conversion-by-sample superTrack on show shortLabel gc-by-sample longLabel gene conversion by sample filter.score 5:1000 filterByRange.score on filterLimits.score 0:1000 """ track_comp = """ track {sm} parent gene-conversion-by-sample compositeTrack on shortLabel {sm}-gc longLabel {sm} gene conversion type bigWig visibility full track gc-{sm} parent {sm} bigDataUrl gene-conversion/{sm}.bb shortLabel {sm} gc longLabel {sm} gene conversion type bigBed 9 + itemRgb on visibility dense track interact-{sm} parent {sm} bigDataUrl gene-conversion/{sm}.interact.bb shortLabel {sm} interact longLabel {sm} interactions type bigInteract maxHeightPixels 100:30:5 visibility full """ all_tracks = """ track g-c-interact bigDataUrl all_candidate_interactions.bb shortLabel all gc interact longLabel all gene conversion interactions type bigInteract visibility hide track Donor bigDataUrl all_candidate_windows_donor.bw shortLabel Donor longLabel Donor type bigWig color 211,144,0 autoScale on visibility full track Acceptor bigDataUrl all_candidate_windows_acceptor.bw shortLabel Acceptor longLabel Acceptor type bigWig color 0,127,211 autoScale on visibility full track gene-conversion-windows bigDataUrl all_candidate_windows.bb shortLabel all g-c windows longLabel all gene conversion windows type bigBed 9 + itemRgb on visibility dense maxItems 100000 """ view_format_super = """ # SuperTrack declaration no type or visibility is required # However "show" is needed to have a superTrack on by default track gene-conversion longLabel gene conversion for HPRC samples superTrack on show shortLabel gene-conversion """ view_format_comp = """ # Composite declaration, usually composite tracks are all of one type, # and the type can be declared. # When a mixed type (some bigBeds, some bigWigs) you need to use the unusual # 'type bed 3' declaration. # The subGroup1 line will define groups, # in this case the special 'view' group # (a new subGroup2 could be metadata) # Later individual tracks will identify what 'subGroups id' they belong to. track gene-conversion-by-sample type bed 3 longLabel gene conversion by sample parent gene-conversion compositeTrack on shortLabel gc-by-sample visibility full subGroup1 view Views bb=Colored_bigBed_items int=Interact_Data bg=BigBedGraph_items """ view_fromat_bb = """ # This is the unexpected part about views, # you need a separate parent to group the view # So this new view-specific stanza with "view id" # can collect all tracks with some visibility settings track gene-conversion-by-sample-bb parent gene-conversion-by-sample on view bb visibility pack itemRgb on maxItems 100000 filter.score 5:1000 filterByRange.score on filterLimits.score 0:1000 """ view_format_bb_sm = """ # Child bigBed in this view # The 'subGroups view=bb' shares this track belongs in a view, # even though a parent declaration is also needed # All these tracks should be the same type of data track gene-conversion-by-sample-bb-{sm} parent gene-conversion-by-sample-bb type bigBed 9 + longLabel {sm} gene conversion bb bigDataUrl gene-conversion/{sm}.bb shortLabel {sm}-gc-bb subGroups view=bb """ view_format_int = """ # New View Stanza that collects all interact in this composite # This declares related bigInteract tracks track gene-conversion-by-sample-interact parent gene-conversion-by-sample on view int visibility full maxHeightPixels 100:55:5 maxItems 100000 filter.score 5:1000 filterByRange.score on filterLimits.score 0:1000 """ view_format_int_sm = """ # Child one Interact track gene-conversion-by-sample-interact-{sm} parent gene-conversion-by-sample-interact type bigInteract longLabel {sm} gene conversion interactions bigDataUrl gene-conversion/{sm}.interact.bb shortLabel {sm}-gc-interact subGroups view=int """ view_format_bg = """ track gene-conversion-by-sample-bg parent gene-conversion-by-sample on view bg visibility full maxHeightPixels 100:10:5 maxItems 100000 """ view_format_bg_sm = """ track gene-conversion-by-sample-bg-{sm} parent gene-conversion-by-sample-bg longLabel {sm} gene conversion bg bigDataUrl gene-conversion/{sm}.bg shortLabel {sm}-gc-bg subGroups view=bg autoScale Off graphTypeDefault Bar gridDefault OFF windowingFunction Mean color 175,4,4 altColor 47,79,79 viewLimits 0:5 type bigWig 0 1000 """ with open(snakemake.output.track, "w") as out: out.write(all_tracks) if False: out.write(track_db_header) # out.write(track_db_interact_header) [ out.write( (track + track_interact).format( sm=sm, pri=2 * idx + 1, pri2=2 * idx + 2 ) ) # pri=idx + 1, pri2=idx + 2)) for idx, sm in enumerate(snakemake.params.samples) ] elif True: out.write(view_format_super) out.write(view_format_comp) # big beds out.write(view_fromat_bb) [out.write(view_format_bb_sm.format(sm=sm)) for sm in snakemake.params.samples] # bigInteract out.write(view_format_int) [out.write(view_format_int_sm.format(sm=sm)) for sm in snakemake.params.samples] # bedGraph out.write(view_format_bg) [out.write(view_format_bg_sm.format(sm=sm)) for sm in snakemake.params.samples] else: out.write(track_super) [out.write(track_comp.format(sm=sm)) for sm in snakemake.params.samples] open(snakemake.output.hub, "w").write(hub) ref = snakemake.wildcards.ref if "CHM13_V1.1" in ref: print("changing ref") ref = "t2t-chm13-v1.1" open(snakemake.output.genomes, "w").write(genomes.format(ref=ref)) |
Snakemake workflow: biseps
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 | import sys, math, multiprocessing, subprocess, os # Usage: python3 bismark_to_bigwig_pe.py [-keep] [-sort] [-all] [-L=labels] [-p=num_proc] <chrm_sizes> <allC_file> [allC_file]* # NOTE: allc file contains the methylation information for all chromosomes # Steps: # 1. allC to bedGraph # 2. sort bedGraph if necessary # 3. bedGraph to BigWig # 4. remove temporary files NUMPROC=1 def processInputs( allCFileAr, chrmFileStr, keepTmp, labelsAr, outID, numProc, isSort, useAll ): print( 'Keep temp files: {:s}; Sort bedGraph: {:s}; Use all positions: {:s}'.format( str( keepTmp), str (isSort), str(useAll))) print( 'Begin processing files with {:d} processors'.format( numProc ) ) pool = multiprocessing.Pool( processes=numProc ) results = [ pool.apply_async( processFile, args=(allCFileAr[i], chrmFileStr, labelsAr[i], outID, keepTmp, isSort, useAll) ) for i in range(len(allCFileAr)) ] suc = [ p.get() for p in results ] print( 'Done' ) def processFile( allCFileStr, chrmFileStr, label, outID, keepTmp, isSort, useAll ): if outID == None and label == None: outID = allCFileStr.replace( '.tsv','' ).replace( 'allc_','' ) elif outID == None: outID = label elif label == None: outID += '_' + allCFileStr.replace( '.tsv','' ).replace( 'allc_','' ) else: outID += '_' + label print( 'Reading allC file {:s}'.format( allCFileStr ) ) # allC to bedGraphs bedGraphStr = outID + '.bedGraph' bedGraphAr = [bedGraphStr + '.' + x for x in ['cg','chg','chh'] ] readAllC( allCFileStr, bedGraphAr, useAll ) if isSort: print( 'Sorting bedGraph files' ) for b in bedGraphAr: sortBedFile( b ) print( 'Converting {:s} files to BigWig'.format(bedGraphStr ) ) # bedGraph to bigWig for b in bedGraphAr: processBedGraph( b, chrmFileStr ) # remove temporary if keepTmp == False: print( 'Removing temporary files...' ) for b in bedGraphAr: os.remove( b ) print( 'BigWig finished for {:s}.bw.*'.format( outID ) ) def readAllC( allCFileStr, outFileAr, useAll ): allCFile = open( allCFileStr, 'r' ) outFileAr = [open( outFileStr, 'w' ) for outFileStr in outFileAr] mTypes = [ 'CG', 'CHG', 'CHH' ] for line in allCFile: lineAr = line.rstrip().split('\t') # (0) chr (1) pos (2) strand (3) mC (4) C (5) Cctxt # (6) trintctxt if len(lineAr) < 7: continue elif ( useAll or int(lineAr[3])> 0 ): chrm = lineAr[0] pos = int( lineAr[1] ) - 1 methType = decodeMethType( lineAr[5] ) try: mInd = mTypes.index( methType ) except ValueError: continue value = float( lineAr[3] ) / (float( lineAr[4] ) + float( lineAr[3])) # adjust for negative strand if lineAr[2] == '-': value = value * -1 # (0) chrm (1) start (2) end (3) value outStr = '{:s}\t{:d}\t{:d}\t{:f}\n'.format( chrm, pos, pos+1, value ) outFile = outFileAr[mInd] outFile.write( outStr ) # end if # end for allCFile.close() [ outFile.close() for outFile in outFileAr ] def decodeMethType( mStr ): if mStr.startswith( 'CG' ): return 'CG' elif mStr.endswith( 'G' ): return 'CHG' elif mStr == 'CNN': return 'CNN' else: return 'CHH' def sortBedFile( bedFileStr ): command = 'bedSort {:s} {:s}'.format( bedFileStr, bedFileStr ) subprocess.call( command, shell=True ) def processBedGraph( bedGraphStr, chrmFileStr ): bigWigStr = bedGraphStr.replace( '.bedGraph', '.bw' ) #print( bigWigStr ) # bedGraphToBigWig in.bedGraph chrom.sizes out.bw command = 'bedGraphToBigWig {:s} {:s} {:s}'.format( bedGraphStr, chrmFileStr, bigWigStr ) subprocess.call( command, shell=True) def parseInputs( argv ): # Usage: python3 bismark_to_bigwig_pe.py [-keep] [-sort] [-all] [-L=labels] [-p=num_proc] <chrm_sizes> <allC_file> [allC_file]* keepTmp = False labelsAr = [] numProc = NUMPROC isSort = False useAll = False outID = None startInd = 0 for i in range( min(5, len(argv)-2) ): if argv[i] == '-keep': keepTmp = True startInd += 1 elif argv[i] == '-sort': isSort = True startInd += 1 elif argv[i] == '-all': useAll = True startInd += 1 elif argv[i].startswith( '-L=' ): labelsAr = argv[i][3:].split( ',' ) startInd += 1 elif argv[i].startswith( '-o=' ): outID = argv[i][3:] startInd += 1 elif argv[i].startswith( '-p=' ): try: numProc = int( argv[i][3:] ) startInd += 1 except ValueError: print( 'ERROR: number of processors must be an integer' ) exit() elif argv[i] in [ '-h', '--help', '-help']: printHelp() exit() elif argv[i].startswith( '-' ): print( 'ERROR: {:s} is not a valid parameter'.format( argv[i] ) ) exit() chrmFileStr = argv[startInd] allCFileAr = [] for j in range(startInd+1, len(argv) ): allCFileAr += [ argv[j] ] if len(labelsAr) == 0: labelsAr = [None] * len(allCFileAr) elif len(labelsAr) != len(allCFileAr): print( "ERROR: number of labels doesn't match number of allC files" ) exit() processInputs( allCFileAr, chrmFileStr, keepTmp, labelsAr, outID, numProc, isSort, useAll ) def printHelp(): print ("Usage: python3 bismark_to_bigwig_pe.py [-keep] [-sort] [-all] [-L=labels] [-o=out_id] [-p=num_proc] <chrm_sizes> <bismark_file> [bismark_file]*") print( 'Converts bismark files to context-specific BigWig files' ) print( 'Note: bedGraphToBigWig and bedSort programs must be in the path' ) print( 'Required:' ) print( 'chrm_file\ttab-delimited file with chromosome names and lengths,\n\t\ti.e. fasta index file' ) print( 'bismark_file\tbismark file with all chrms and contexts' ) print( 'Optional:' ) print( '-keep\t\tkeep intermediate files' ) print( '-sort\t\tcalls bedSort; add this option if bigwig conversion fails' ) print( '-all\t\tuse all covered positions including 0s [default only includes mC > 1]' ) print( '-L=labels\tcomma-separated list of labels to use for the allC files;\n\t\tdefaults to using information from the allc file name' ) print( '-o=out_id\toptional identifier to be added to the output file names' ) print( '-p=num_proc\tnumber of processors to use [default 1]' ) if __name__ == "__main__": if len(sys.argv) < 3 : printHelp() else: parseInputs( sys.argv[1:] ) |
format / edam
bigWig
bigWig format for large sequence annotation tracks that consist of a value for each sequence position. Similar to textual WIG format.