

package proteomics


  /** ProtRec, Version 11.1
   *
   *  This is a Scala implementation of the ProtRec missing protein
   *  inference method described in
   *
   *      Kong et al., "PROTREC: A probability-based approach for
   *      recovering missing proteins based on biological networks",
   *      Journal of Proteomics, 250:104392, January 2022. 
   *
   *  ProtRec uses protein complexes as the context for inferring whether
   *  a protein should be in a sample. It computes the prob of a protein X 
   *  being present in the sample given the prob a complex C containing it 
   *  being present in that sample, as below.  
   *
   *    prob(X present) =
   *       prob(X present | C present) * prob(C present) +
   *       prob(X present | C not present) * prob(C not present).
   *
   *    prob(C present) is estimated by fraction of its proteins present.
   *    prob(X present | C present) = 1, as presence of a complex implies
   *                                     all its proteins are present.
   *    prob(X present | C not present) = 1 - FDR of X.
   *  
   *  Wong Limsoon
   *  14 May 2023
   */


  object ProteinRecovery:

    import dbmodel.RESOURCE.Resource
    import dbmodel.Synchronizable.CBI
    import dbmodel.Synchrony.siterator
    import dbmodel.OrderedCollection.Key
    import dbmodel.OpG.{ SUM, BIGGEST }
    // import dbmodel.TSVModel.{ Remy, RemyFile, RemySchema, transientRemyFile }
    import dbmodel.TSVModel.FIELDTYPE.*
    import dbmodel.TSVModel.{ *, given }
    import scala.language.implicitConversions
    import scala.math.{ max, min }


    /** While ProtRec is independent of ProInfer, they share many
     *  data structures and routines. To avoid redundancy, we import 
     *  all of these from ProInfer:
     *
     *  types: PROTEIN, CPXID, 
     *         Call = Remy { pro, accPEP, conf, label, fdr, qval },
     *         Cpx = Remy { cpxId, pro, targetProb, decoyProb },
     *         CallFile[K] = RemyFile[Call,K],
     *         CpxFile[K] = RemyFile[Cpx,K]
     *  keys:  kCpxId, kUP
     *  defs:  Call(pro, accPEP, conf, label, fdr, qval), CallFile, 
     *         Cpx(cpx, pro, targetProb, decoyProb),
     *         kUPFile, kProFile, kCpxIdFile
     *  class: ComplexDB
     *  val:   ORGANISM
     */

    import proteomics.ProteinInference.{ *, given }



    /** Default parameter values shared by the ProtRec routines,
     *  and the type of a ProtRec score.
     */

    /** Default threshold on global FDR. */
    val FDR: Double = 0.05

    /** Default estimate of global FNR. */
    val FNR: Double = 0.20

    /** Default complex size; use complexes of size >= 5. */
    val SIZE: Int = 5

    /** A ProtRec score. */
    type PROTREC = Double


    /** Remy record [[ProtRecCall(pro, protrec, accPEP,conf,label,fdr,qval)]] 
     *  represents a ProtRec call on a protein.
     *
     *  Besides the ProtRec score [[protrec]], a record carries the
     *  fields of an ordinary protein call. Elsewhere in this file,
     *  [[label]] is 1 for a target protein and -1 for a decoy.
     */

    type ProtRecCall = Remy {
      val pro: PROTEIN       // the protein this call is about
      val protrec: Double    // ProtRec score assigned to the protein
      val accPEP: Double
      val conf: Double
      val label: Int         // 1 = target, -1 = decoy
      val fdr: Double   // local FDR
      val qval: Double  // global FDR
    }

    /** Schema of [[ProtRecCall]]: column names and their field types,
     *  listed in column order.
     */

    // NOTE(review): the third column is named "AccPEP" whereas the
    // record field declared above is accPEP. Confirm that Remy resolves
    // the accPEP field against this column name.
    given ProtRecCallSchema: RemySchema[ProtRecCall] = RemySchema {
      Vector("pro" -> STRING, "protrec" -> DOUBLE, "AccPEP" -> DOUBLE,
             "conf" -> DOUBLE, "label" -> INT, 
             "fdr" -> DOUBLE, "qval" -> DOUBLE)
    }

    /** Construct a [[ProtRecCall]] record.
     *
     *  The parameter order here ([[label]] comes third) differs from
     *  the column order of [[ProtRecCallSchema]]; the values are put
     *  into schema order when the record is made.
     *
     *  @param pro      the protein
     *  @param protrec  the ProtRec score of the protein
     *  @param label    1 for a target, -1 for a decoy (default: 1)
     *  @param accPEP   default: 0.0
     *  @param conf     default: 0.0
     *  @param fdr      local FDR (default: 1.0)
     *  @param qval     global FDR (default: 1.0)
     *  @return the record built by [[ProtRecCallSchema]]
     */

    def ProtRecCall(
            pro: PROTEIN, 
            protrec: Double,
            label: Int = 1,
            accPEP: Double = 0.0,
            conf: Double = 0.0,
            fdr: Double = 1.0,
            qval: Double = 1.0)
        : ProtRecCall =
      // Values must follow the column order of ProtRecCallSchema.
      ProtRecCallSchema.make(Array(pro,protrec,accPEP,conf,label,fdr,qval))

    /** A file of [[ProtRecCall]] records. [[K]] is the type of the key
     *  the file is ordered by; e.g. [[PROTEIN]] for a file ordered by
     *  protein, [[PROTREC]] for a file ordered by ProtRec score.
     */

    type ProtRecCallFile[K] = RemyFile[ProtRecCall,K]

    /** TSV file support for [[ProtRecCall]], based on [[ProtRecCallSchema]].
     */

    given ProtRecCallFile: RemyTSVFile[ProtRecCall] = 
      RemyTSVFile[ProtRecCall](using ProtRecCallSchema)
      // use ProtRecCallFile(filename) to read a ProtRecCallFile from disk. 

    /** Sorting key on the [[protrec]] field of a record, built
     *  with [[Key.dsc]].
     */

    def kProtRec[R <: Remy { val protrec: Double }] =
      Key.dsc[R, Double](rec => rec.protrec)

    extension [R <: Remy { val protrec: Double }](r: IterableOnce[R])

      /** View these records as a transient Remy file keyed by [[kProtRec]].
       */

      def kProtRecFile(using RemyTSVFile[R]) =
        r.transientRemyFile(kProtRec[R])



    extension (complexes: CpxFile[PROTEIN])

      /** Estimate the prob of the presence of a complex based on 
       *  the fraction of its constituent proteins that are called. 
       *  Estimate the prob of the presence of a decoy complex based 
       *  on the fraction of its constituent decoy proteins called. 
       *
       *  This is done in two passes:
       *
       *   1. Every (complex, protein) entry gets a target score and
       *      a decoy score. The target score is 0 if no call with
       *      label 1 matches the protein; otherwise it is 1 - q,
       *      where q is the largest value, among the matching calls,
       *      of "qval if qval is below the threshold, else 1".
       *      The decoy score is computed likewise from the calls
       *      with label -1.
       *   2. For every complex, the target (resp. decoy) scores of
       *      its entries are summed, divided by the expected number
       *      of reported proteins, and capped at 1. Each entry of the
       *      complex is then re-emitted with these two probs.
       *
       *  The type parameter [[K]] is not used in the body.
       *
       *  @param complexes  A file of complexes, ordered by sorting key [[kUP]]
       *  @param calls      A file of protein calls, ordered by [[kUP]]. 
       *                    Given an entry [[p]] in the file, 
       *                    assume [[p.qval]] is the global fdr corresponding
       *                    at the rank of protein [[p.pro]].
       *  @param size       Lower bound (damping) on the expected number of
       *                    reported proteins of a complex (default: [[SIZE]]).
       *  @param qval       Use only proteins where [[p.qval < qval]]
       *                    (default: [[FDR]]).
       *  @param fnr        Estimated global FNR (default: [[FNR]]).
       *  @return a copy of [[complexes]] updated with the prob of presence
       *          for each complex and its decoy. The file is ordered
       *          by [[kUP]].
       */

      def probPresent[K](
              calls: CallFile[PROTEIN], 
              size: Int = SIZE,
              qval: Double = FDR,
              fnr: Double = FNR)
          : CpxFile[PROTEIN] = {
        // Split the calls into targets (label 1) and decoys (label -1).
        val proteinCalls = calls.filter(_.label == 1)
        val decoyCalls = calls.filter(_.label == -1)
        // A call and a complex entry match when they have the same protein.
        val canSee = (y: PROTEIN, x: PROTEIN) => y == x
        val pSI = proteinCalls.siterator(kUP, canSee)
        val dSI = decoyCalls.siterator(kUP, canSee)
        // Calls at or above the qval threshold count as fdr 1.0,
        // so that they contribute 1 - 1.0 = 0 below.
        val fdr = (p: Call) => if p.qval < qval then p.qval else 1.0 
        val targetProb = (p: Cpx) => p.targetProb 
        val decoyProb = (p: Cpx) => p.decoyProb 

        // For a complex, identify its proteins/decoys that got called.
        // The result is re-keyed by complex id, for clustering below.
        val cpxHits = {
          for cp <- complexes
          yield
            val ps = pSI.syncedWith(cp)
            val ds = dSI.syncedWith(cp)
            val tp = if ps.isEmpty then 0.0 else 1.0 - (BIGGEST(fdr) of ps)
            val fp = if ds.isEmpty then 0.0 else 1.0 - (BIGGEST(fdr) of ds)
            Cpx(cp.cpxId, cp.pro, tp, fp)
        }.kCpxIdFile.ordered

        // For a complex, compute the fraction of its proteins & decoys called.
        try 
          cpxHits
            .clustered
            .flatMap( { case (cpxId, es) =>
               // ln is the expected # of reported proteins
               // from complex cpxId at the given FNR. 
               // A damping size is added to ln to make it
               // numerically/statistically more stable.
               val ln = max(size.toDouble, es.length.toDouble * (1 - fnr))
               // A complex may have slightly more or slightly
               // fewer reported proteins than given by FNR.
               // So, need to make sure probabilities tp and fp <= 1.
               val tp = min(1.0, (SUM(targetProb) of es)/ ln)
               val fp = min(1.0, (SUM(decoyProb) of es) / ln)
               es.map(e => Cpx(cpxId, e.pro, tp, fp)) } )
            .kUPFile
            .ordered
            .materialized
        // Release the intermediate files and iterators in all cases.
        finally Resource.closeAll(cpxHits, dSI, pSI, decoyCalls, proteinCalls)
      }


      /** Score every protein in a [[CallFile]] by the complexes
       *  that contain it.
       *
       *  For a call on protein X and a complex C containing X, let p
       *  be [[targetProb]] of C when the call has label 1, and
       *  [[decoyProb]] of C otherwise. The score of X w.r.t. C is
       *
       *      p + (1 - qval of X) * (1 - p).
       *
       *  The [[protrec]] score of X is the max of this over all the
       *  complexes containing X, or 0 if no complex contains X.
       *
       *  @param complexes  A file of complexes, ordered by sorting key [[kUP]]
       *                    Given an entry [[c]] in the file, 
       *                    assume [[c.targetProb]] is the prob of complex
       *                    [[c.cpxId]] being present, and [[c.decoyProb]]
       *                    is the prob of the corresponding decoy complex
       *                    being present. 
       *  @param calls      A file of protein calls, ordered by [[kUP]]. 
       *                    Given an entry [[p]] in the file, 
       *                    assume [[p.qval]] is the global fdr at the
       *                    rank of protein [[p.pro]].
       *  @return a copy of [[calls]], with updated [[protrec]] scores.
       *          The file is ordered by [[kUP]]. 
       */

      def proteinProbs(calls: CallFile[PROTEIN]): ProtRecCallFile[PROTEIN] = {
        val sameProtein = (y: PROTEIN, x: PROTEIN) => y == x
        val complexIter = complexes.siterator(kUP, sameProtein)

        val scored: CBI[ProtRecCall] = calls.map { call =>
          // One score per complex entry that contains this protein.
          val scores = complexIter.syncedWith(call).map { entry =>
            val present =
              if call.label == 1 then entry.targetProb else entry.decoyProb
            val absent = 1 - present
            present + (1 - call.qval) * absent
          }
          val best = if scores.isEmpty then 0.0 else scores.max
          ProtRecCall(
            pro = call.pro,
            protrec = best,
            label = call.label,
            accPEP = call.accPEP,
            conf = call.conf,
            fdr = call.fdr,
            qval = call.qval)
        }

        try scored.kUPFile.serialized.use(calls, complexes)
        finally Resource.closeAll(scored, complexIter)
      }


      /** Score the proteins that are in [[complexes]] but have no
       *  entry in a given [[CallFile]]. Those with a high score are
       *  the missing proteins recovered.
       *
       *  For each such protein p, two records are produced:
       *
       *   - "RECOVERED|p", with label 1 and [[protrec]] equal to the
       *     biggest [[targetProb]] among the complexes containing p;
       *   - "DECOY_RECOVERED|p", with label -1 and [[protrec]] equal
       *     to the biggest [[decoyProb]] among these complexes.
       *
       *  The other fields of these records take the defaults
       *  of [[ProtRecCall]].
       *
       *  @param complexes  A file of complexes, ordered by sorting key [[kUP]]
       *                    Given an entry [[c]] in the file, 
       *                    assume [[c.targetProb]] is the prob of complex
       *                    [[c.cpxId]] being present, and [[c.decoyProb]]
       *                    is the prob of the corresponding decoy complex
       *                    being present. 
       *  @param calls      A file of protein calls, ordered by [[kUP]]. 
       *  @return a [[ProtRecCallFile]] for these missing proteins.
       *          The file is ordered by [[kUP]]. 
       */

      def missingProteinProbs(calls: CallFile[PROTEIN]): ProtRecCallFile[PROTEIN] = {
        val sameProtein = (y: PROTEIN, x: PROTEIN) => y == x
        val byProtein = Key.asc((p: PROTEIN) => p)
        val callIter = calls.siterator(byProtein, sameProtein)
        val targetOf = (e: Cpx) => e.targetProb
        val decoyOf = (e: Cpx) => e.decoyProb

        // Records for one protein and the complex entries containing it;
        // nothing is produced if the protein already has a call.
        val recover: ((PROTEIN,Vector[Cpx])) => Vector[ProtRecCall] = _ match
          case (p: PROTEIN, es: Vector[Cpx]) =>
            if callIter.syncedWith(p).isEmpty then
              val targetScore = BIGGEST(targetOf) of es
              val target = ProtRecCall(s"RECOVERED|$p", targetScore, label = 1)
              val decoyScore = BIGGEST(decoyOf) of es
              val decoy = ProtRecCall(s"DECOY_RECOVERED|$p", decoyScore, label = -1)
              Vector(target, decoy)
            else Vector()

        val recovered = complexes.clustered.flatMap(recover)
        try recovered.kUPFile.serialized.use(calls)
        finally Resource.closeAll(recovered, callIter)
      }

  
      /** Compute the prob of a protein X being present given the
       *  prob a complex C containing it being present.  If X is in
       *  multiple complexes, take the max. Do these for all proteins
       *  in [[complexes]] or in given [[CallFile]]. 
       *
       *  The result merges the output of [[proteinProbs]] (proteins
       *  in [[calls]]) with the output of [[missingProteinProbs]]
       *  (proteins in [[complexes]] but not in [[calls]]).
       *
       *  @param complexes  A file of complexes, ordered by sorting key [[kUP]]
       *                    Given an entry [[c]] in the file, 
       *                    assume [[c.targetProb]] is the prob of complex
       *                    [[c.cpxId]] being present, and [[c.decoyProb]]
       *                    is the prob of the corresponding decoy complex
       *                    being present. 
       *  @param calls      A file of protein calls, ordered by [[kUP]]. 
       *                    Given an entry [[p]] in the file, 
       *                    assume [[p.qval]] is the global fdr at the rank
       *                    of protein [[p.pro]].
       *  @return a [[ProtRecCallFile]] for missing proteins & proteins
       *          in [[calls]]. The file is ordered by [[protrec]] score. 
       */

      def allProteinProbs(calls: CallFile[PROTEIN]): ProtRecCallFile[PROTREC] = {
        // Each intermediate file is guarded by its own try/finally, so
        // that a failure in a later step still closes what was opened.
        val inCallFile = proteinProbs(calls)
        try
          val notInCallFile = missingProteinProbs(calls)
          try
            val merged = inCallFile.mergedWith(notInCallFile)
            try merged.kProtRecFile.ordered.serialized.use(calls)
            finally Resource.closeAll(merged)
          finally Resource.closeAll(notInCallFile)
        finally Resource.closeAll(inCallFile)
      }


    /** ProtRec, set up with a database of reference complexes.
     *
     *  @param complexDB  The reference complexes to use as context.
     */

    case class ProtRec(complexDB: ComplexDB):

      /** Compute ProtRec scores for all proteins in [[calls]] and
       *  for all proteins in the complex database that are missing
       *  from [[calls]].
       *
       *  @require The [[qval]] field of [[calls]] is the global FDR at
       *           the rank of the corresponding protein [[pro]]
       *
       *  @param calls  A file of protein calls, in any order.
       *  @param size   Damping size used when estimating the prob
       *                of a complex (default: [[SIZE]]).
       *  @param qval   Use only calls with qval below this when
       *                estimating the prob of a complex (default: [[FDR]]).
       *  @param fnr    Estimated global FNR (default: [[FNR]]).
       *  @return a [[ProtRecCallFile]] ordered by [[protrec]] score.
       */

      def recoverAllProteins[K](
              calls: CallFile[K], 
              size: Int = SIZE,
              qval: Double = FDR,
              fnr: Double = FNR)
          : ProtRecCallFile[PROTREC] =
        val kUPCalls = calls.kUPFile.ordered.serialized
        // Nested try/finally: kUPCalls is closed even if computing
        // or closing cpxProbs fails.
        try
          val cpxProbs = complexDB.cpxPro.probPresent(kUPCalls, size, qval, fnr)
          try cpxProbs.allProteinProbs(kUPCalls)
          finally cpxProbs.close()
        finally kUPCalls.close()

 
      /** If a protein does not appear in the complex database,
       *  ProtRec assigns it a protrec score of zero. So, it
       *  is better to clean up by excluding these proteins.
       *  This method keeps only the calls on proteins found in
       *  the complex database, then runs [[recoverAllProteins]].
       *
       *  @require The [[qval]] field of [[calls]] is the global FDR at
       *           the rank of the corresponding protein [[pro]]
       *
       *  @param calls  A file of protein calls, in any order.
       *  @param size   See [[recoverAllProteins]].
       *  @param qval   See [[recoverAllProteins]].
       *  @param fnr    See [[recoverAllProteins]].
       *  @return a [[ProtRecCallFile]] ordered by [[protrec]] score.
       */

      def recoverProteins[K](
               calls: CallFile[K], 
               size: Int = SIZE,
               qval: Double = FDR,
               fnr: Double = FNR)
           : ProtRecCallFile[PROTREC] =
        val kUPCalls = calls.kUPFile.ordered
        val ky = Key.asc((p: PROTEIN) => p)
        val canSee = (y: PROTEIN, x: PROTEIN) => y == x
        val callSI = kUPCalls.siterator(ky, canSee)
        // The cleaned file is built inside the try, so that callSI and
        // kUPCalls are released even if building it fails.
        try
          val cleaned = 
            val proteins = complexDB.proteins
            val vec = for (p <- proteins; c <- callSI.syncedWith(p)) yield c
            vec.kUPFile
          try recoverAllProteins(cleaned, size, qval, fnr)
          finally Resource.closeAll(cleaned)
        finally Resource.closeAll(callSI, kUPCalls)


    object ProtRec:

      /** Run ProtRec on a call file and save the result.
       *
       *  Reads the calls in [[callFile]], loads the complexes in
       *  [[complexFile]], runs [[recoverProteins]], optionally drops
       *  the records whose label is not 1, and saves to [[outFile]].
       *
       *  @param callFile     File of protein calls.
       *  @param outFile      Name of the output file.
       *  @param complexFile  File of reference complexes.
       *  @param qval         Global FDR threshold (default: [[FDR]]).
       *  @param fnr          Estimated global FNR (default: [[FNR]]).
       *  @param organism     Organism passed to [[ComplexDB]]
       *                      (default: [[ORGANISM]]).
       *  @param size         Complex size passed to [[ComplexDB]] and
       *                      to [[recoverProteins]] (default: [[SIZE]]).
       *  @param omitDecoys   Whether to keep only the records with
       *                      label 1 (default: true).
       *  @return the saved [[ProtRecCallFile]].
       */

      def apply(
              callFile: String, 
              outFile: String, 
              complexFile: String, 
              qval: Double = FDR,
              fnr: Double = FNR,
              organism: String = ORGANISM,
              size: Int = SIZE,
              omitDecoys: Boolean = true)
          : ProtRecCallFile[PROTREC] =
        val calls = CallFile(callFile)
        val complexDB = ComplexDB(complexFile, organism, size)
        val noOmit = ProtRec(complexDB).recoverProteins(calls,size, qval, fnr)
        val result = if omitDecoys then noOmit.filter(_.label == 1) else noOmit
        try result.saveAs(outFile)
        finally Resource.closeAll(result, noOmit, complexDB, calls)



      /** [[protrec]] is intended as a commandline-executable.
       *  It can be run from the commandline like this:
       *  scala.bat proteomics.protrec CALLS OUT CPLX 0.05 0.20 Human 5
       *
       *  The positional arguments are, in order: callFile, outFile,
       *  complexFile, qval, fnr, organism, size. The first three are
       *  mandatory. The flag 'no-omit' may be given anywhere; it is
       *  not counted as a positional argument.
       */

      @main def protrec(args: String*): Unit = {
        val doc =
          """
          | ProtRec is a missing protein inference method. It uses
          | protein complexes as a context for inferring whether a
          | protein should be present in a sample.
          |
          | Invoke it like this:
          |
          |     scala proteomics.protrec \
          |       callFile outFile complexFile \
          |       qval fnr organism size omit
          |
          | The meaning of these parameters are as follow:
          |
          |   callFile   - File of proteins called by e.g. ProInfer
          |   outFile    - Name of the output file.
          |                 The output contains the missing proteins
          |                 called by ProtRec, as well as proteins in
          |                 in input callFile with the scores that
          |                 ProtRec has assigned to them.
          |                 File is sorted in descending ProtRec score.
          |   complexFile- Reference protein complexes, CORUM format.
          |   qval       - Consider protein p in callFile with 
          |                 p.qval < qval (default: 0.05).
          |                 Note that ProtRec interprets qval as global FDR.
          |                 If the qval in callFile does not correspond to
          |                 global FDR, ProtRec predictions are less reliable. 
          |   fnr        - Estimated global FNR (default: 0.20)
          |   organism   - Use only complexes of this organism (default: Human).
          |   size       - Use only complexes of at least this size (default: 5).
          |   omit       - Leave this blank if decoys are to be omitted;
          |                 otherwise, put 'no-omit' here.
          | """.stripMargin              

        // 'no-omit' is a flag rather than a value. Take it out before
        // reading the positional arguments, so that it is never parsed
        // as a number or a file name when optional arguments are left out.
        val omit = !args.contains("no-omit")
        val positional = args.filterNot(_ == "no-omit")
        val numArgs = positional.length
        def sArg(p: Int, d: String) = if p < numArgs then positional(p) else d 
        def dArg(p: Int, r: Double) = if p < numArgs then positional(p).toDouble else r
        def iArg(p: Int, i: Int) = if p < numArgs then positional(p).toInt else i

        if args.contains("--help") then println(doc)
        else if numArgs < 3 then println("Not enough arguments")
        else
          val callFile = positional(0)
          val outFile = positional(1)
          val complexFile = positional(2)
          val qval = dArg(3, FDR)
          val fnr = dArg(4, FNR)
          val organism = sArg(5, ORGANISM)
          val size = iArg(6, SIZE)
          val run = ProtRec(
            callFile, outFile, 
            complexFile, qval, fnr, organism, size, omit)

          try  
            // Do some analysis on the results: look at the targets that
            // were not below the qval threshold originally, and whose
            // ProtRec score exceeds 1 - qval.
            val rescued = run
              .filter { c => 
                 c.label == 1 && 
                 c.qval >= qval &&
                 c.protrec > 1 - c.qval }
               .toSeq
            println(s"\nAnalyzing proteins originally at qval >= $qval")
            println(s"Assuming FNR is $fnr")
            println("# of proteins that would be rescued:")
            for t <- Seq(0.99, 0.95, 0.90, 0.85, 0.80, 0.75, 0.70)
            do
              val r = rescued.filter { _.protrec > t }
              // Proteins absent from the call file are named RECOVERED|...
              val m = r.filter { _.pro.startsWith("RECOVERED") }
              print(s"... at ProtRec score > $t: ${r.length}; ")
              println(s" of these ${m.length} were totally missing.")
          finally run.close()

      }

    end ProtRec

  end ProteinRecovery



/** Example ************************************************************
 *
 *
{{{


   import proteomics.ProteinInference.{ ProInfer, globalFDRComputed }
   import proteomics.ProteinRecovery.ProtRec

//
// Test data
//


   def db(s: String) = "test-proinfer/" ++ s
   val REFPROTEINS  = db("human-proteins-including-decoys.fasta")
   val REFCOMPLEXES = db("complexes.txt")
   val INPUT        = db("DDA1.tsv")
   val CALLFILE     = "DDA1-called-proteins.tsv"
   val CORRECTED    = "DDA1-fdr-corrected.tsv"
   val OUTPUT       = "DDA1-protrec-proteins.tsv"

//
// Run ProInfer on test data above using default parameter values
// This generates a protein call file that can be used by ProtRec
//


   val calls = ProInfer(INPUT, CALLFILE, REFPROTEINS, REFCOMPLEXES)


//
// The qval field of ProInfer's output is not global FDR.
// Need to correct this.
//


   val corrected = calls.globalFDRComputed.saveAs(CORRECTED)


//
// Now run ProtRec using default parameter values
//


   ProtRec(CORRECTED, OUTPUT, REFCOMPLEXES)

   calls.close()
   corrected.close()

}}}
 *
 *
 */




