#
# Test script for Synchrony GMQL in Python
#
# Wong Limsoon
# 19 Nov 2020
#

from aggriterators import OpG   # Aggregate function objects.

from simplebed import (
    # Constructor for a BED region
    Bed,
    # Constructors for a BED EFile
    OnDiskBedEFile, TransientBedEFile, BedEFile,
    # Endows genometric predicates on a pair of Bed objects
    BedPred,
    # Genometric predicates. Take BedPred objects as input.
    SameChrom, SameStrand, SameLoci,
    EndAfter, EndBefore, EndTogether,
    StartAfter, StartBefore, StartTogether,
    IsBefore, Inside, Enclose, Outside, Touch, Overlap,
    DL, DLE, DG, DGE, GenPred)

from samples import (
    # Constructor for a sample
    Sample,
    # Constructors for a sample EFile
    TransientSampleEFile, OnDiskSampleEFile, OnDiskAltSampleEFile)

from bedfileops import (
    # BFOut is a class providing the output functions
    # listed below, to be used with joinR.
    # - left, right, intersect, both, cat: (Bed, Bed) -> list[Bed]
    BFOut,
    # BFOrder is a class providing the ordering and "cover"
    # functions listed below for combining output regions.
    # - plus, minus, null: BedEIterator -> TransientBedEFile
    # - withStrand: (BedIterator, BedEIterator -> BedEFile) -> OnDiskBedEFile
    # - histo, summit: (int -> bool) -> BedEIterator -> TransientBedEFile
    # - cover, flat: (int -> bool) -> BedEIterator -> TransientBedEFile
    # - complement: BedEIterator -> TransientBedEFile
    BFOrder,
    # BFCover is a class providing the functions listed below
    # for selecting regions in "cover" functions.
    # - atleast, atmost, exactly: int -> int -> bool
    # - between: (int, int) -> int -> bool
    BFCover,
    # BFOps is a class providing the GMQL query functions on regions.
    # - selectR, projectR, extendR, partitionByLociR, partitionByR,
    # - mapR, differenceR, joinR, coverR, histoR, summitR, flatR, complementR
    BFOps)

from samplefileops import (
    # Output functions to be used with joinS.
    # - left, right, overwrite: (Sample, Sample) -> Sample
    SFOut,
    # The GMQL query functions on samples.
    # - onRegion,
    # - selectS, projectS, extendS, groupByS,
    # - mapS, differenceS, joinS, coverS
    SFOps)

import samplefileops
samplefileops.MATERIALISE
# MATERIALISE = False - always lazy-stream output.
# MATERIALISE = True  - always serialize output to disk.

from encode import (
    # Converts the bedFile object to ENCODE NP for all samples.
    # - toEncodeNPSampleEFile: SampleEFile -> SampleEFile
    toEncodeNPSampleEFile,
    # Imports a Stefano-formatted sample file to Limsoon's format.
    # - importEncodeNPSampleEFile(
    #       dbpath: str, dblist: str, newname: str) -> OnDiskSampleEFile
    importEncodeNPSampleEFile)

#
# OK, let's read a sample file to play with.
#

ctcf = importEncodeNPSampleEFile(
    "tmp/ctcf/files",
    "tmp/ctcf-list.txt",
    "ctcf")

# After the import, we can read the imported "ctcf" file
# in future using OnDiskSampleEFile():
#
#   ctcf = OnDiskSampleEFile("ctcf")

ctcf[0].meta()                  # metadata of sample 0
ctcf[0].tf                      # the metadata 'tf' of sample 0
[s.meta() for s in ctcf[1:3]]   # metadata of samples 1 and 2

#
# Merge the BED files of samples 2 and 3,
# then replace the BED file of sample 0 with the result.
#

mm = ctcf[0].bedFileUpdated(
    ctcf[2].bedFile.mergedWith(
        ctcf[3].bedFile))

mb = mm.bedFile.serialized()

mb[0].dict()                # mb[0].dict() is no longer ctcf[0].dict().
ctcf[0].bedFile[0].dict()   # It has become either ctcf[2].bedFile[0].dict()
ctcf[2].bedFile[0].dict()   # or ctcf[3].bedFile[0].dict(), depending on
ctcf[3].bedFile[0].dict()   # which one comes before the other.

mb[0].peak          # Meta attributes and region attributes
mb[0].chromStart    # can be accessed directly w/o going thru
ctcf[0].cell        # meta() and dict().

#
# Before the example queries, let's first review the structure
# of samples and their tracks (genomic regions). Conceptually,
# a GMQL sample file can be thought of as a simple 1-level
# nested relation,
#
#   EFile[Sample(l1, ..,
#                ln,
#                bedFile: EFile[Bed(chrom,
#                                   chromStart,
#                                   chromEnd,
#                                   strand,
#                                   name,
#                                   score,
#                                   r1, ..., rm)])]
#
# where l1, .., ln are metadata attributes of a sample, and
# r1, .., rm are metadata attributes of a genomic region
# (a row in a BED file).
#
# An EFile can be thought of as a "file-based vector", which
# transparently materializes the parts of the vector of samples
# and BED rows into memory when they are needed in a computation.
#
# The main query operations provided by GMQL mirror the
# relational algebra (SELECT, PROJECT, JOIN, GROUPBY) on samples
# (ignoring the track attribute, i.e. the BED file), along with
# aggregate functions on the track attribute. Moreover, regarding
# the track attribute/BED file as a relational table, GMQL also
# provides relational algebra operations (SELECT, PROJECT, JOIN,
# GROUPBY) on the track/BED file, as well as operations with
# specialized genomic meaning (MAP, DIFFERENCE, and COVER) on
# the track/BED file.
#
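#
# A concrete walk over this nested structure, using only access
# operations already demonstrated above (indexing, meta(),
# bedFile, dict(), and direct attribute access); nothing here
# relies on new API.
#

s0 = ctcf[0]                             # one Sample in the outer EFile
s0.meta()                                # its metadata attributes l1, .., ln
r0 = s0.bedFile[0]                       # one Bed row in the inner EFile
r0.dict()                                # chrom, chromStart, .., r1, .., rm
(r0.chrom, r0.chromStart, r0.chromEnd)   # region attributes, accessed directly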
#
# GMQL select
#

#
# SELECT query on regions (i.e. BED file rows), selectR ...
#
# onRegion(f)(db) applies the function f:BedEFile -> BedEFile
# to every sample in db:SampleEFile.
#
# selectR(p)(us) is roughly the following pseudo SQL query
# on us:BedEFile,
#
#   SELECT u.* FROM us u WHERE p(u)
#

q2 = SFOps.onRegion(
    BFOps.selectR(lambda r: r.chrom == "chr3")
)(ctcf)

for s in q2:
    print("***", s.sid)
    for r in s.bedFile[0:3]: print(r.dict())

#
# query on samples, selectS ...
#
# selectS(f)(db) is roughly the following pseudo SQL query
# on db:SampleEFile,
#
#   SELECT s.* FROM db s WHERE f(s)
#
# That is, it selects the samples in db satisfying the predicate
# f:Sample -> bool.
#

q1a = SFOps.selectS(
    onSample = lambda s: len(s.bedFile) > 50000
)(ctcf)

q1b = SFOps.selectS(
    lambda s: len(s.bedFile) > 50000
)(ctcf)

for s in ctcf: print(s.sid, len(s.bedFile))
for s in q1a: print(s.sid, len(s.bedFile))
for s in q1b: print(s.sid, len(s.bedFile))

#
# GMQL extend
#

#
# query on samples, extendS ...
#
# extendS({ l1 : f1, .., ln : fn })(db) is this pseudo SQL query
# on db:SampleEFile,
#
#   SELECT s.*, l1 as f1(s), .., ln as fn(s) FROM db s
#
# The functions f:Sample -> A are arbitrary functions on samples.
# The l are strings to be used as new metadata names.
#

q3a = SFOps.extendS({
    "bedcount": lambda s: len(s.bedFile),
    "mysid": "sid"
})(ctcf)

for s in q3a: print(s.meta())

#
# query on samples, projectS ...
#
# projectS({ l1 : f1, .., ln : fn })(db) is this pseudo SQL query
# on db:SampleEFile,
#
#   SELECT l1 as f1(s), .., ln as fn(s) FROM db s
#
# The functions f:Sample -> A are arbitrary functions on samples.
# The l are strings to be used as new metadata names.
#

q3b = SFOps.projectS({
    "bedcount": lambda s: len(s.bedFile),
    "mysid": "sid"
})(ctcf)

for s in q3b: print(s.meta())

#
# If samplefileops.MATERIALISE = True, q3b is automatically
# materialized on disk. So when q3b is read back, an extra
# field sid is added to record the file names of the BED files.

samplefileops.MATERIALISE = False

q3c = SFOps.projectS({
    "bedcount": lambda s: len(s.bedFile),
    "mysid": "sid"
})(ctcf)

for s in q3c: print(s.meta())

#
# q3c is the same query as q3b, but with samplefileops.MATERIALISE
# = False, i.e., q3c is lazy-streamed instead of materialized on
# disk. So no extra sid field is added. Check this by comparing
# with q3b.

samplefileops.MATERIALISE = True
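#
# A quick check of the q3b-vs-q3c difference described above:
# the materialized q3b should carry the extra sid field, while
# the lazy-streamed q3c should not. A minimal sketch; it assumes
# MATERIALISE was True when q3b was built and that meta() returns
# a dict-like object supporting membership tests. If it does not,
# inspect the printed meta() output above instead.
#

print("sid" in q3b[0].meta())   # expected: True  (materialized)
print("sid" in q3c[0].meta())   # expected: False (lazy-streamed)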
#
# GMQL map
#

#
# query on samples and regions, mapS and mapR ...
#
# mapS(f, g, joinby)(db1, db2) is this pseudo SQL query
# on db1:SampleEFile and db2:SampleEFile,
#
#   SELECT g(s, t).bedFileUpdated(f(s.bedFile, t.bedFile))
#   FROM db1 s, db2 t
#   WHERE joinby(s, t)
#
#   g: (Sample, Sample) -> Sample,
#   f: (BedEFile, BedEFile) -> BedEFile,
#   joinby: [(Sample, Sample) -> bool]
#
# can be any functions of the right type. In particular,
# f:(BedEFile, BedEFile) -> BedEFile can be a mapR query on
# BED files.
#
# mapR({ k1 : a1, .., kn : an })(ctr)(us, vs) is roughly
# this pseudo SQL query on us:BedEFile and vs:BedEFile,
#
#   SELECT u.*, ctr as COUNT, k1 as A1, .., kn as An
#   FROM us u, vs v
#   WHERE u overlaps v
#   GROUPBY locus of u
#   WITH A1 = a1 applied to the group, ..,
#        An = an applied to the group,
#        COUNT is the size of the group
#
# ctr, k1, .., kn are strings to be used as names of new metadata;
# ctr is implicit and can be omitted.
# a:Aggr[Bed,float] are aggregate functions on a BedFile;
# common aggregate functions can be imported from the OpG package.
#

mapres2 = SFOps.mapS(
    onRegion = BFOps.mapR({
        "avgscore": OpG.average(lambda r: r.score),
        "stats": OpG.stats(lambda r: r.score) })
)(ctcf, ctcf)

# OpG.average(f) and OpG.stats(f) take a function f:Bed -> float/int.
# f can be replaced by a str; in this case, it is taken as a
# Bed attribute name. See mapres2a below.

mapres2a = SFOps.mapS(
    onRegion = BFOps.mapR({
        "avgscore": OpG.average("score"),
        "stats": OpG.stats("score") })
)(ctcf, ctcf)

mapres2a[0].bedFile[0].dict()

mapres3 = SFOps.mapS(
    onRegion = BFOps.mapR({ "avgscore": OpG.average("score") }),
    joinby = "sid"
)(ctcf, ctcf)

for s in mapres3:
    print("***", s.meta())
    for r in s.bedFile[0:3]: print(r.dict())

#
# GMQL groupby
#

#
# query on samples, groupByS ...
#
# groupByS(g, { l1 : a1, .., ln : an })(db) is roughly this pseudo
# SQL query on db:SampleEFile,
#
#   SELECT s.*, group as s.g, l1 as A1, .., ln as An
#   FROM db s
#   GROUPBY s.g
#   WITH A1 = result of aggregate function a1 on a group, ..,
#        An = result of aggregate function an on a group
#
# g:str is a sample attribute to be used for grouping samples in db.
# l:str are names of new metadata (the aggregate function results).
# a:Aggr[Sample,float] are aggregate functions on samples.
#

grpbysid = SFOps.groupByS(
    grp = "sid",
    aggrs = { "count": OpG.count() }
)(mapres3)

print(grpbysid.slurped())
grpbysid[0].bedFile[2].dict()

#
# query on BED files, partitionByR ...
#
# partitionByR(g, { k1 : a1, .., kn : an })(us) is roughly this
# pseudo SQL query on us:BedEFile,
#
#   SELECT u.chrom, u.chromStart, u.chromEnd, u.g, k1 as A1, .., kn as An
#   FROM us u
#   GROUPBY u.chrom, u.chromStart, u.chromEnd, u.g
#   WITH A1 = a1 applied to the group, ..,
#        An = an applied to the group.
#
# g:str is an optional Bed attribute to be used, in addition to
# loci, for grouping entries in the BED file.
# k:str are names of new metadata (the aggregate function results).
# a:Aggr[Bed,float] are aggregate functions to be applied on a group.
#

grpbylocus = SFOps.onRegion(
    BFOps.partitionByLociR(aggrs = {
        "minscore": OpG.smallest("score"),
        "regcount": OpG.count() })
)(mapres3)

grpbylocus[0].bedFile[2].dict()
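#
# partitionByR's optional g argument (grouping by an extra Bed
# attribute in addition to loci) is described above but not
# exercised. A minimal sketch, assuming g is passed positionally
# as in the documented signature partitionByR(g, { ... }), and
# that "strand" (a Bed attribute in the schema reviewed earlier)
# is a valid choice for g; both details are assumptions, not
# something this script confirms.
#

grpbystrand = SFOps.onRegion(
    BFOps.partitionByR("strand", aggrs = {
        "minscore": OpG.smallest("score"),
        "regcount": OpG.count() })
)(mapres3)

grpbystrand[0].bedFile[2].dict()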
#
# GMQL difference
#

#
# difference of samples' tracks/regions, differenceS and differenceR ...
#
# differenceS(joinby, exact)(db1, db2) is roughly this pseudo
# SQL query on db1:SampleEFile and db2:SampleEFile,
#
#   SELECT s.*,
#          differenceR(exact)(
#              s.bedFile,
#              SELECT t.bedFile FROM db2 t WHERE joinby(s, t))
#   FROM db1 s
#
# differenceR(exact)(us, d1, .., dn) is this pseudo SQL query on
# us:BedEFile, d1:BedEFile, .., dn:BedEFile,
#
#   SELECT u.*
#   FROM us u
#   WHERE exact && the locus of u is not in any of d1, .., dn
#      OR (!exact) && the locus of u does not overlap any region
#                     in d1, .., dn.
#

diffdb = SFOps.differenceS()(
    TransientSampleEFile([ctcf[0]]),
    TransientSampleEFile(ctcf[1:]))

diffdb[0].bedFile[9].dict()
len(diffdb[0].bedFile)
len(ctcf[0].bedFile)

#
# GMQL join
#

#
# query on samples and regions, joinS and joinR ...
#
# joinS(f, g, joinby)(db1, db2) is roughly this pseudo SQL query
# on db1:SampleEFile and db2:SampleEFile,
#
#   SELECT g(s, t).bedFileUpdated(f(s.bedFile, t.bedFile))
#   FROM db1 s, db2 t
#   WHERE joinby(s, t)
#
#   g: (Sample, Sample) -> Sample,
#   f: (BedEFile, BedEFile) -> BedEFile,
#   joinby: (Sample, Sample) -> bool
#
# can be any functions of the right type. In particular,
# f:(BedEFile, BedEFile) -> BedEFile can be the joinR function
# for a join query on BED files.
#
# joinR(ycs1, .., ncs1, ..)(output, screen)(us, vs)
# is this pseudo SQL query on us:BedEFile and vs:BedEFile,
#
#   SELECT output(u, v)
#   FROM us u, vs v
#   WHERE ycs1(v, u) && ycs2(v, u) && ..
#         ncs1(v, u) && ncs2(v, u) && ..
#         screen(u, v)
#
#   output: (Bed, Bed) -> list[Bed],
#   screen: (Bed, Bed) -> bool,
#   ycs, ncs: LocusPred
#
# can be any functions of the right type. In particular,
# ycs - convex genometric predicates (e.g. DLE(k), DL(k), Overlap(k));
# ncs - non-convex genometric predicates (e.g. DGE(k), DG(k));
# output - BFOut.{ both, left, right, cat, intersect }
#

joindb = SFOps.joinS(
    onRegion = BFOps.joinR(geno = [DGE(5000), DLE(100000)])
)(ctcf, ctcf)

for s in joindb[0:3]:
    print("***", s.meta())
    for r in s.bedFile[0:3]: print(r.dict())

#
# GMQL cover
#

histo = SFOps.coverS(
    onRegion = BFOps.histoR(BFCover.atleast(1))
)(ctcf)

print(histo.slurped())
for r in histo[0].bedFile[0:5]: print(r.dict())

cover = SFOps.coverS(
    onRegion = BFOps.coverR(BFCover.atleast(1))
)(ctcf)

print(cover.slurped())
for r in cover[0].bedFile[0:5]: print(r.dict())

summit = SFOps.coverS(
    onRegion = BFOps.summitR(BFCover.atleast(1))
)(ctcf)

print(summit.slurped())
for r in summit[0].bedFile[0:5]: print(r.dict())

flat = SFOps.coverS(
    onRegion = BFOps.flatR(BFCover.atleast(1))
)(ctcf)

print(flat.slurped())
for r in flat[0].bedFile[0:5]: print(r.dict())

complement = SFOps.coverS(
    onRegion = BFOps.complementR()
)(ctcf)

print(complement.slurped())
for r in complement[0].bedFile[0:5]: print(r.dict())
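#
# The cover queries above all use BFCover.atleast. The other
# accumulation predicates documented with the bedfileops import
# (atmost, exactly, between) slot into the same position. A
# minimal sketch using between; whether its two bounds are passed
# as separate arguments (as below) or as a single pair is an
# assumption based on the documented signature
# between: (int, int) -> int -> bool.
#

cover23 = SFOps.coverS(
    onRegion = BFOps.coverR(BFCover.between(2, 3))   # assumed arg style
)(ctcf)

print(cover23.slurped())
for r in cover23[0].bedFile[0:5]: print(r.dict())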
#-----------------------------------------------
#
# "Object-oriented" version of the same queries
#
#-----------------------------------------------

from samplefileops import SampleFile, OnDiskSampleFile, TransientSampleFile
# OnDiskSampleFile and TransientSampleFile are OnDiskSampleEFile
# and TransientSampleEFile endowed with methods for selectS,
# projectS, etc.

from encode import importEncodeNPSampleFile
# Imports Stefano-formatted ENCODE NP sample files
# into an OnDiskSampleFile.

import time   # This module is needed for timing studies.

def timing(f):
    start = time.time()
    f()
    end = time.time()
    print("*** time: ", end - start, " seconds")

ctcf = importEncodeNPSampleFile(
    "tmp/ctcf/files",
    "tmp/ctcf-list.txt",
    "ctcf")

# You only need to import once. Subsequently, you can simply
# load the imported file like this:
#
#   ctcf = OnDiskSampleFile("ctcf")

ctcf[0].meta()
ctcf[0].tf

#
# Now we can do selectS, projectS, etc. in "object-oriented" style ...
#

def q2():
    return ctcf.onRegion(BFOps.selectR(lambda r: r.chrom == "chr3"))

timing(q2)

for s in q2():
    print("***", s.sid)
    for r in s.bedFile[0:3]: print(r.dict())

def q1a():
    return ctcf.selectS(onSample = lambda s: len(s.bedFile) > 50000)

timing(q1a)

for s in ctcf: print(s.sid, len(s.bedFile))
for s in q1a(): print(s.sid, len(s.bedFile))

def q3a():
    return ctcf.extendS({
        "bedcount": lambda s: len(s.bedFile),
        "mysid": "sid" })

timing(q3a)

for s in q3a(): print(s.meta())

q3b = ctcf.projectS({
    "bedcount": lambda s: len(s.bedFile),
    "mysid": "sid" })

for s in q3b: print(s.meta())

def mapres3q():
    return ctcf.mapS(
        others = ctcf,
        onRegion = BFOps.mapR({
            "avgscore": OpG.average("score"),
            "stats": OpG.stats("score") }),
        joinby = "sid")

timing(mapres3q)
mapres3 = mapres3q()
mapres3[0].bedFile[0].dict()

grpbysid = mapres3.groupByS(
    grp = "sid",
    aggrs = { "count": OpG.count() })

grpbysid[0].bedFile[2].dict()

grpbylocus = mapres3.onRegion(BFOps.partitionByLociR(
    aggrs = {
        "minscore": OpG.smallest("score"),
        "regcount": OpG.count() }))

grpbylocus[0].bedFile[2].dict()

def diffdb():
    return TransientSampleFile([ctcf[0]]).differenceS(
        others = TransientSampleFile(ctcf[1:]),
        exact = False)

timing(diffdb)
diffdb()[0].bedFile[9].dict()

def joindb():
    return ctcf.joinS(
        others = ctcf,
        onRegion = BFOps.joinR(geno = [DGE(5000), DLE(100000)]))

timing(joindb)

for s in joindb()[0:2]:
    print("***", s.meta())
    for r in s.bedFile[0:2]: print(r.dict())

histo = ctcf.coverS(onRegion = BFOps.histoR(BFCover.atleast(2)))

for r in histo[0].bedFile[0:5]: print(r.dict())
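#
# None of the joins above pass an explicit output function, so
# they use joinR's default. The BFOut functions (left, right,
# both, cat, intersect) documented with the bedfileops import
# can be supplied to joinR to control what each matching pair
# (u, v) contributes. A minimal sketch, assuming the keyword
# parameter is named output, following the
# joinR(...)(output, screen)(us, vs) description earlier in this
# script; the keyword name is an assumption, not something this
# script confirms.
#

def joinleft():
    return ctcf.joinS(
        others = ctcf,
        onRegion = BFOps.joinR(
            geno = [DGE(5000), DLE(100000)],
            output = BFOut.left))   # assumed keyword; emit left regions

timing(joinleft)
for r in joinleft()[0].bedFile[0:3]: print(r.dict())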