#
# Test script for Synchrony GMQL in Python
#
# Wong Limsoon
# 19 Nov 2020
#

from aggriterators import OpG   # Aggregate function objects.

from simplebed import (
    # Constructor for a BED region
    Bed,
    # Constructors for a BED EFile
    OnDiskBedEFile, TransientBedEFile, BedEFile,
    # Endows genometric predicates on a pair of Bed objects
    BedPred,
    # Genometric predicates. Take BedPred objects as input.
    SameChrom, SameStrand, SameLoci,
    EndAfter, EndBefore, EndTogether,
    StartAfter, StartBefore, StartTogether,
    IsBefore, Inside, Enclose, Outside, Touch, Overlap,
    DL, DLE, DG, DGE, GenPred)

from samples import (
    # Constructor for a sample
    Sample,
    # Constructors for a sample EFile
    TransientSampleEFile, OnDiskSampleEFile, OnDiskAltSampleEFile)

from bedfileops import (
    # BFOut is a class providing the output functions
    # listed below, to be used with joinR.
    # - left, right, intersect, both, cat: (Bed, Bed) -> list[Bed]
    BFOut,
    # BFOrder is a class providing the ordering and "cover"
    # functions listed below for combining output regions.
    # - plus, minus, null: BedEIterator -> TransientBedEFile
    # - withStrand: (BedIterator, BedEIterator -> BedEFile) -> OnDiskBedEFile
    # - histo, summit: (int -> bool) -> BedEIterator -> TransientBedEFile
    # - cover, flat: (int -> bool) -> BedEIterator -> TransientBedEFile
    # - complement: BedEIterator -> TransientBedEFile
    BFOrder,
    # BFCover is a class providing the functions listed below
    # for selecting regions in "cover" functions.
    # - atleast, atmost, exactly: int -> int -> bool
    # - between: (int, int) -> int -> bool
    BFCover,
    # BFOps is a class providing the GMQL query functions on regions.
    # - selectR, projectR, extendR, partitionByLociR, partitionByR,
    # - mapR, differenceR, joinR, coverR, histoR, summitR, flatR, complementR
    BFOps)

from samplefileops import (
    # Output functions to be used with joinS.
    # - left, right, overwrite: (Sample, Sample) -> Sample
    SFOut,
    # The GMQL query functions on samples.
    # - onRegion,
    # - selectS, projectS, extendS, groupByS,
    # - mapS, differenceS, joinS, coverS
    SFOps)

import samplefileops
samplefileops.MATERIALISE
# MATERIALISE = False - always lazy-stream output.
# MATERIALISE = True  - always serialize output to disk.

from encode import (
    # Converts the bedFile object to ENCODE NP for all samples.
    # - toEncodeNPSampleEFile: SampleEFile -> SampleEFile
    toEncodeNPSampleEFile,
    # Imports a Stefano-formatted sample file to Limsoon's format.
    # - importEncodeNPSampleEFile(
    #       dbpath: str, dblist: str, newname: str) -> OnDiskSampleEFile
    importEncodeNPSampleEFile)

#
# OK, let's read a sample file to play with.
#

ctcf = importEncodeNPSampleEFile(
    "tmp/ctcf/files",
    "tmp/ctcf-list.txt",
    "ctcf")

# After the import, we can read the imported "ctcf" file
# in future using OnDiskSampleEFile():
#
#   ctcf = OnDiskSampleEFile("ctcf")

ctcf[0].meta()                  # metadata of sample 0
ctcf[0].tf                      # the metadata 'tf' of sample 0
[s.meta() for s in ctcf[1:3]]   # metadata of samples 1 and 2

#
# Merge the BED files of samples 2 and 3,
# then replace the BED file of sample 0 with the result.
#

mm = ctcf[0].bedFileUpdated(
    ctcf[2].bedFile.mergedWith(
        ctcf[3].bedFile))

mb = mm.bedFile.serialized()

mb[0].dict()                # mb[0].dict() is no longer ctcf[0].dict().
ctcf[0].bedFile[0].dict()   # It has become either ctcf[2].bedFile[0].dict()
ctcf[2].bedFile[0].dict()   # or ctcf[3].bedFile[0].dict(), depending on
ctcf[3].bedFile[0].dict()   # which one comes before the other.

mb[0].peak          # Meta attributes and region attributes
mb[0].chromStart    # can be accessed directly w/o going thru
ctcf[0].cell        # meta() and dict().

#
# Before the example queries, let's first review the structure
# of samples and their tracks (genomic regions). Conceptually,
# a GMQL sample file can be thought of as a simple 1-level
# nested relation,
#
#   EFile[Sample(l1, ..,
#                ln,
#                bedFile: EFile[Bed(chrom,
#                                   chromStart,
#                                   chromEnd,
#                                   strand,
#                                   name,
#                                   score,
#                                   r1, ..., rm)])]
#
# where l1, .., ln are metadata attributes of a sample, and
# r1, .., rm are metadata attributes of a genomic region
# (a row in a BED file).
#
# An EFile can be thought of as a "file-based vector", which
# transparently materializes the parts of the vector of samples
# and BED rows into memory when they are needed in a computation.
#
# The main query operations provided by GMQL mirror the
# relational algebra (SELECT, PROJECT, JOIN, GROUPBY) on samples
# (ignoring the track attribute, i.e. the BED file), along with
# aggregate functions on the track attribute. Moreover, regarding
# the track attribute/BED file as a relational table, GMQL also
# provides relational algebra operations (SELECT, PROJECT, JOIN,
# GROUPBY) on the track/BED file, as well as operations with
# specialized genomic meaning (MAP, DIFFERENCE, and COVER) on
# the track/BED file.
#
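#
# A concrete walk over this nested structure, using only access
# operations already demonstrated above (indexing, meta(),
# bedFile, dict(), and direct attribute access); nothing here
# relies on new API.
#

s0 = ctcf[0]                             # one Sample in the outer EFile
s0.meta()                                # its metadata attributes l1, .., ln
r0 = s0.bedFile[0]                       # one Bed row in the inner EFile
r0.dict()                                # chrom, chromStart, .., r1, .., rm
(r0.chrom, r0.chromStart, r0.chromEnd)   # region attributes, accessed directly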
#
# GMQL select
#

#
# SELECT query on regions (i.e. BED file rows), selectR ...
#
# onRegion(f)(db) applies the function f:BedEFile -> BedEFile
# to every sample in db:SampleEFile.
#
# selectR(p)(us) is roughly the following pseudo SQL query
# on us:BedEFile,
#
#   SELECT u.* FROM us u WHERE p(u)
#

q2 = SFOps.onRegion(
    BFOps.selectR(lambda r: r.chrom == "chr3")
)(ctcf)

for s in q2:
    print("***", s.sid)
    for r in s.bedFile[0:3]: print(r.dict())

#
# query on samples, selectS ...
#
# selectS(f)(db) is roughly the following pseudo SQL query
# on db:SampleEFile,
#
#   SELECT s.* FROM db s WHERE f(s)
#
# That is, it selects the samples in db satisfying the predicate
# f:Sample -> bool.
#

q1a = SFOps.selectS(
    onSample = lambda s: len(s.bedFile) > 50000
)(ctcf)

q1b = SFOps.selectS(
    lambda s: len(s.bedFile) > 50000
)(ctcf)

for s in ctcf: print(s.sid, len(s.bedFile))
for s in q1a: print(s.sid, len(s.bedFile))
for s in q1b: print(s.sid, len(s.bedFile))

#
# GMQL extend
#

#
# query on samples, extendS ...
#
# extendS({ l1 : f1, .., ln : fn })(db) is this pseudo SQL query
# on db:SampleEFile,
#
#   SELECT s.*, l1 as f1(s), .., ln as fn(s) FROM db s
#
# The functions f:Sample -> A are arbitrary functions on samples.
# The l are strings to be used as new metadata names.
#

q3a = SFOps.extendS({
    "bedcount": lambda s: len(s.bedFile),
    "mysid": "sid"
})(ctcf)

for s in q3a: print(s.meta())

#
# query on samples, projectS ...
#
# projectS({ l1 : f1, .., ln : fn })(db) is this pseudo SQL query
# on db:SampleEFile,
#
#   SELECT l1 as f1(s), .., ln as fn(s) FROM db s
#
# The functions f:Sample -> A are arbitrary functions on samples.
# The l are strings to be used as new metadata names.
#

q3b = SFOps.projectS({
    "bedcount": lambda s: len(s.bedFile),
    "mysid": "sid"
})(ctcf)

for s in q3b: print(s.meta())

#
# If samplefileops.MATERIALISE = True, q3b is automatically
# materialized on disk. So when q3b is read back, an extra
# field sid is added to record the file names of the BED files.

samplefileops.MATERIALISE = False

q3c = SFOps.projectS({
    "bedcount": lambda s: len(s.bedFile),
    "mysid": "sid"
})(ctcf)

for s in q3c: print(s.meta())

#
# q3c is the same query as q3b, but with samplefileops.MATERIALISE
# = False, i.e., q3c is lazy-streamed instead of materialized on
# disk. So no extra sid field is added. Check this by comparing
# with q3b.

samplefileops.MATERIALISE = True
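#
# A quick check of the q3b-vs-q3c difference described above:
# the materialized q3b should carry the extra sid field, while
# the lazy-streamed q3c should not. A minimal sketch; it assumes
# MATERIALISE was True when q3b was built and that meta() returns
# a dict-like object supporting membership tests. If it does not,
# inspect the printed meta() output above instead.
#

print("sid" in q3b[0].meta())   # expected: True  (materialized)
print("sid" in q3c[0].meta())   # expected: False (lazy-streamed)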
#
# GMQL map
#

#
# query on samples and regions, mapS and mapR ...
#
# mapS(f, g, joinby)(db1, db2) is this pseudo SQL query
# on db1:SampleEFile and db2:SampleEFile,
#
#   SELECT g(s, t).bedFileUpdated(f(s.bedFile, t.bedFile))
#   FROM db1 s, db2 t
#   WHERE joinby(s, t)
#
#   g: (Sample, Sample) -> Sample,
#   f: (BedEFile, BedEFile) -> BedEFile,
#   joinby: [(Sample, Sample) -> bool]
#
# can be any functions of the right type. In particular,
# f:(BedEFile, BedEFile) -> BedEFile can be a mapR query on
# BED files.
#
# mapR({ k1 : a1, .., kn : an })(ctr)(us, vs) is roughly
# this pseudo SQL query on us:BedEFile and vs:BedEFile,
#
#   SELECT u.*, ctr as COUNT, k1 as A1, .., kn as An
#   FROM us u, vs v
#   WHERE u overlaps v
#   GROUPBY locus of u
#   WITH A1 = a1 applied to the group, ..,
#        An = an applied to the group,
#        COUNT is the size of the group
#
# ctr, k1, .., kn are strings to be used as names of new metadata;
# ctr is implicit and can be omitted.
# a:Aggr[Bed,float] are aggregate functions on a BedFile;
# common aggregate functions can be imported from the OpG package.
#

mapres2 = SFOps.mapS(
    onRegion = BFOps.mapR({
        "avgscore": OpG.average(lambda r: r.score),
        "stats": OpG.stats(lambda r: r.score) })
)(ctcf, ctcf)

# OpG.average(f) and OpG.stats(f) take a function f:Bed -> float/int.
# f can be replaced by a str; in this case, it is taken as a
# Bed attribute name. See mapres2a below.

mapres2a = SFOps.mapS(
    onRegion = BFOps.mapR({
        "avgscore": OpG.average("score"),
        "stats": OpG.stats("score") })
)(ctcf, ctcf)

mapres2a[0].bedFile[0].dict()

mapres3 = SFOps.mapS(
    onRegion = BFOps.mapR({ "avgscore": OpG.average("score") }),
    joinby = "sid"
)(ctcf, ctcf)

for s in mapres3:
    print("***", s.meta())
    for r in s.bedFile[0:3]: print(r.dict())

#
# GMQL groupby
#

#
# query on samples, groupByS ...
#
# groupByS(g, { l1 : a1, .., ln : an })(db) is roughly this pseudo
# SQL query on db:SampleEFile,
#
#   SELECT s.*, group as s.g, l1 as A1, .., ln as An
#   FROM db s
#   GROUPBY s.g
#   WITH A1 = result of aggregate function a1 on a group, ..,
#        An = result of aggregate function an on a group
#
# g:str is a sample attribute to be used for grouping samples in db.
# l:str are names of new metadata (the aggregate function results).
# a:Aggr[Sample,float] are aggregate functions on samples.
#

grpbysid = SFOps.groupByS(
    grp = "sid",
    aggrs = { "count": OpG.count() }
)(mapres3)

print(grpbysid.slurped())
grpbysid[0].bedFile[2].dict()

#
# query on BED files, partitionByR ...
#
# partitionByR(g, { k1 : a1, .., kn : an })(us) is roughly this
# pseudo SQL query on us:BedEFile,
#
#   SELECT u.chrom, u.chromStart, u.chromEnd, u.g, k1 as A1, .., kn as An
#   FROM us u
#   GROUPBY u.chrom, u.chromStart, u.chromEnd, u.g
#   WITH A1 = a1 applied to the group, ..,
#        An = an applied to the group.
#
# g:str is an optional Bed attribute to be used, in addition to
# loci, for grouping entries in the BED file.
# k:str are names of new metadata (the aggregate function results).
# a:Aggr[Bed,float] are aggregate functions to be applied on a group.
#

grpbylocus = SFOps.onRegion(
    BFOps.partitionByLociR(aggrs = {
        "minscore": OpG.smallest("score"),
        "regcount": OpG.count() })
)(mapres3)

grpbylocus[0].bedFile[2].dict()
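#
# partitionByR's optional g argument (grouping by an extra Bed
# attribute in addition to loci) is described above but not
# exercised. A minimal sketch, assuming g is passed positionally
# as in the documented signature partitionByR(g, { ... }), and
# that "strand" (a Bed attribute in the schema reviewed earlier)
# is a valid choice for g; both details are assumptions, not
# something this script confirms.
#

grpbystrand = SFOps.onRegion(
    BFOps.partitionByR("strand", aggrs = {
        "minscore": OpG.smallest("score"),
        "regcount": OpG.count() })
)(mapres3)

grpbystrand[0].bedFile[2].dict()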
#
# GMQL difference
#

#
# difference of samples' tracks/regions, differenceS and differenceR ...
#
# differenceS(joinby, exact)(db1, db2) is roughly this pseudo
# SQL query on db1:SampleEFile and db2:SampleEFile,
#
#   SELECT s.*,
#          differenceR(exact)(
#              s.bedFile,
#              SELECT t.bedFile FROM db2 t WHERE joinby(s, t))
#   FROM db1 s
#
# differenceR(exact)(us, d1, .., dn) is this pseudo SQL query on
# us:BedEFile, d1:BedEFile, .., dn:BedEFile,
#
#   SELECT u.*
#   FROM us u
#   WHERE exact && the locus of u is not in any of d1, .., dn
#      OR (!exact) && the locus of u does not overlap any region
#                     in d1, .., dn.
#

diffdb = SFOps.differenceS()(
    TransientSampleEFile([ctcf[0]]),
    TransientSampleEFile(ctcf[1:]))

diffdb[0].bedFile[9].dict()
len(diffdb[0].bedFile)
len(ctcf[0].bedFile)

#
# GMQL join
#

#
# query on samples and regions, joinS and joinR ...
#
# joinS(f, g, joinby)(db1, db2) is roughly this pseudo SQL query
# on db1:SampleEFile and db2:SampleEFile,
#
#   SELECT g(s, t).bedFileUpdated(f(s.bedFile, t.bedFile))
#   FROM db1 s, db2 t
#   WHERE joinby(s, t)
#
#   g: (Sample, Sample) -> Sample,
#   f: (BedEFile, BedEFile) -> BedEFile,
#   joinby: (Sample, Sample) -> bool
#
# can be any functions of the right type. In particular,
# f:(BedEFile, BedEFile) -> BedEFile can be the joinR function
# for a join query on BED files.
#
# joinR(ycs1, .., ncs1, ..)(output, screen)(us, vs)
# is this pseudo SQL query on us:BedEFile and vs:BedEFile,
#
#   SELECT output(u, v)
#   FROM us u, vs v
#   WHERE ycs1(v, u) && ycs2(v, u) && ..
#         ncs1(v, u) && ncs2(v, u) && ..
#         screen(u, v)
#
#   output: (Bed, Bed) -> list[Bed],
#   screen: (Bed, Bed) -> bool,
#   ycs, ncs: LocusPred
#
# can be any functions of the right type. In particular,
# ycs - convex genometric predicates (e.g. DLE(k), DL(k), Overlap(k));
# ncs - non-convex genometric predicates (e.g. DGE(k), DG(k));
# output - BFOut.{ both, left, right, cat, intersect }
#

joindb = SFOps.joinS(
    onRegion = BFOps.joinR(geno = [DGE(5000), DLE(100000)])
)(ctcf, ctcf)

for s in joindb[0:3]:
    print("***", s.meta())
    for r in s.bedFile[0:3]: print(r.dict())

#
# GMQL cover
#

histo = SFOps.coverS(
    onRegion = BFOps.histoR(BFCover.atleast(1))
)(ctcf)

print(histo.slurped())
for r in histo[0].bedFile[0:5]: print(r.dict())

cover = SFOps.coverS(
    onRegion = BFOps.coverR(BFCover.atleast(1))
)(ctcf)

print(cover.slurped())
for r in cover[0].bedFile[0:5]: print(r.dict())

summit = SFOps.coverS(
    onRegion = BFOps.summitR(BFCover.atleast(1))
)(ctcf)

print(summit.slurped())
for r in summit[0].bedFile[0:5]: print(r.dict())

flat = SFOps.coverS(
    onRegion = BFOps.flatR(BFCover.atleast(1))
)(ctcf)

print(flat.slurped())
for r in flat[0].bedFile[0:5]: print(r.dict())

complement = SFOps.coverS(
    onRegion = BFOps.complementR()
)(ctcf)

print(complement.slurped())
for r in complement[0].bedFile[0:5]: print(r.dict())
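#
# The cover queries above all use BFCover.atleast. The other
# accumulation predicates documented with the bedfileops import
# (atmost, exactly, between) slot into the same position. A
# minimal sketch using between; whether its two bounds are passed
# as separate arguments (as below) or as a single pair is an
# assumption based on the documented signature
# between: (int, int) -> int -> bool.
#

cover23 = SFOps.coverS(
    onRegion = BFOps.coverR(BFCover.between(2, 3))   # assumed arg style
)(ctcf)

print(cover23.slurped())
for r in cover23[0].bedFile[0:5]: print(r.dict())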
#-----------------------------------------------
#
# "Object-oriented" version of the same queries
#
#-----------------------------------------------

from samplefileops import SampleFile, OnDiskSampleFile, TransientSampleFile
# OnDiskSampleFile and TransientSampleFile are OnDiskSampleEFile
# and TransientSampleEFile endowed with methods for selectS,
# projectS, etc.

from encode import importEncodeNPSampleFile
# Imports Stefano-formatted ENCODE NP sample files
# into an OnDiskSampleFile.

import time   # This module is needed for timing studies.

def timing(f):
    start = time.time()
    f()
    end = time.time()
    print("*** time: ", end - start, " seconds")

ctcf = importEncodeNPSampleFile(
    "tmp/ctcf/files",
    "tmp/ctcf-list.txt",
    "ctcf")

# You only need to import once. Subsequently, you can simply
# load the imported file like this:
#
#   ctcf = OnDiskSampleFile("ctcf")

ctcf[0].meta()
ctcf[0].tf

#
# Now we can do selectS, projectS, etc. in "object-oriented" style ...
#

def q2():
    return ctcf.onRegion(BFOps.selectR(lambda r: r.chrom == "chr3"))

timing(q2)

for s in q2():
    print("***", s.sid)
    for r in s.bedFile[0:3]: print(r.dict())

def q1a():
    return ctcf.selectS(onSample = lambda s: len(s.bedFile) > 50000)

timing(q1a)

for s in ctcf: print(s.sid, len(s.bedFile))
for s in q1a(): print(s.sid, len(s.bedFile))

def q3a():
    return ctcf.extendS({
        "bedcount": lambda s: len(s.bedFile),
        "mysid": "sid" })

timing(q3a)

for s in q3a(): print(s.meta())

q3b = ctcf.projectS({
    "bedcount": lambda s: len(s.bedFile),
    "mysid": "sid" })

for s in q3b: print(s.meta())

def mapres3q():
    return ctcf.mapS(
        others = ctcf,
        onRegion = BFOps.mapR({
            "avgscore": OpG.average("score"),
            "stats": OpG.stats("score") }),
        joinby = "sid")

timing(mapres3q)
mapres3 = mapres3q()
mapres3[0].bedFile[0].dict()

grpbysid = mapres3.groupByS(
    grp = "sid",
    aggrs = { "count": OpG.count() })

grpbysid[0].bedFile[2].dict()

grpbylocus = mapres3.onRegion(BFOps.partitionByLociR(
    aggrs = {
        "minscore": OpG.smallest("score"),
        "regcount": OpG.count() }))

grpbylocus[0].bedFile[2].dict()

def diffdb():
    return TransientSampleFile([ctcf[0]]).differenceS(
        others = TransientSampleFile(ctcf[1:]),
        exact = False)

timing(diffdb)
diffdb()[0].bedFile[9].dict()

def joindb():
    return ctcf.joinS(
        others = ctcf,
        onRegion = BFOps.joinR(geno = [DGE(5000), DLE(100000)]))

timing(joindb)

for s in joindb()[0:2]:
    print("***", s.meta())
    for r in s.bedFile[0:2]: print(r.dict())

histo = ctcf.coverS(onRegion = BFOps.histoR(BFCover.atleast(2)))

for r in histo[0].bedFile[0:5]: print(r.dict())
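#
# None of the joins above pass an explicit output function, so
# they use joinR's default. The BFOut functions (left, right,
# both, cat, intersect) documented with the bedfileops import
# can be supplied to joinR to control what each matching pair
# (u, v) contributes. A minimal sketch, assuming the keyword
# parameter is named output, following the
# joinR(...)(output, screen)(us, vs) description earlier in this
# script; the keyword name is an assumption, not something this
# script confirms.
#

def joinleft():
    return ctcf.joinS(
        others = ctcf,
        onRegion = BFOps.joinR(
            geno = [DGE(5000), DLE(100000)],
            output = BFOut.left))   # assumed keyword; emit left regions

timing(joinleft)
for r in joinleft()[0].bedFile[0:3]: print(r.dict())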