

package proteomics

  /** ProInfer, version 11.1
   *
   *  This Scala implementation improves on the original Python code
   *  of Peng Hui, provided at https://github.com/PennHui2016/ProInfer.
   *  It is more than 10x more efficient, even accounting for the
   *  efficiency advantage of Scala vs Python.
   *
   *  Reference: 
   *    Peng, Wong, & Goh, "ProInfer: An interpretable protein
   *    inference tool leveraging on biological networks",
   *    PLoS Computational Biology, 19(3):e1010961, March 2023.
   *
   *  In mass spectrometry (MS)-based proteomics, protein inference
   *  from identified peptides (protein fragments) is a critical step. 
   *  ProInfer (Protein Inference) is a novel protein assembly method
   *  that takes advantage of information in biological networks.
   *  ProInfer assists recovery of proteins supported only by ambiguous
   *  peptides (a peptide which maps to more than one candidate protein)
   *  and enhances the statistical confidence for proteins supported by
   *  both unique and ambiguous peptides. Consequently, ProInfer rescues
   *  weakly supported proteins thereby improving proteome coverage.
   *  Evaluated across THP1 cell line, lung cancer and RAW267.4 datasets,
   *  ProInfer consistently infers more true positives than mainstream
   *  protein inference tools Fido, EPIFANY and PIA. ProInfer is also
   *  adept at retrieving differentially expressed proteins, signifying
   *  its usefulness for functional analysis and phenotype profiling.
   *
   *  Wong Limsoon
   *  14 May 2023
   */



  object ProteinInference:

    import dbmodel.RESOURCE.Resource
    import dbmodel.OrderedCollection.{ OSeq, Key }
    import dbmodel.Synchrony.siterator
    import dbmodel.OpG.{ SUM, PROD, AVERAGE, SMALLEST }
    import dbmodel.DBFile.OFile
    import dbmodel.TSVModel.{ Remy, RemyFile, RemySchema, RemyTSVFile, transientRemyFile }
    import dbmodel.TSVModel.FIELDTYPE.*
    import proteomics.FASTAModel.{ FASTAEntry, FASTAFile, FASTAFILE }
    import proteomics.PEPTIDEModel.{ PeptideFile, PEPTIDEFILE }
    import proteomics.CORUMModel.{ CorumFile, CORUMFILE }
    import scala.math.log10
    import scala.language.implicitConversions


    /** Default parameter values shared by the routines below.
     */

    val THRESHOLD: Double = 0.999     // Default peptide score cut-off.
    val QVAL: Double      = 0.01      // Default q-value cut-off.
    val ORGANISM: String  = "Human"   // Default organism for complex selection.
    val SIZE: Int         = 0         // Consider complexes of size >= SIZE.


    /** Named type aliases, used purely for readability.
     */

    type PROTEIN = String   // Protein identifier, as stored in `pro` fields.
    type PEPTIDE = String   // Peptide, as stored in `pep` fields.
    type CPXID   = Int      // Protein complex identifier, as stored in `cpxId` fields.
    type CONFrev = Double   // Key type of files keyed by kConfrev (conf, descending).


    /** [[Hit(pep, pro, score, falseProb)]]
     *  is a peptide-spectrum match (PSM): peptide `pep` hits protein `pro`
     *  with the given `score`. `falseProb` defaults to 1.0 until it is
     *  computed by `normalized`.
     */

    type Hit = Remy {
      val pep: PEPTIDE         // the peptide
      val pro: PROTEIN         // the protein it is matched to
      val score: Double        // score of the peptide
      val falseProb: Double    // prob that this PSM is not a true hit
    }

    // Column names and types of a Hit, in TSV column order.
    given HitSchema: RemySchema[Hit] = RemySchema(
      Vector(
        "pep"       -> STRING,
        "pro"       -> STRING,
        "score"     -> DOUBLE,
        "falseProb" -> DOUBLE))

    // Constructor for Hit records.
    def Hit(
          pep: PEPTIDE,
          pro: PROTEIN,
          score: Double,
          falseProb: Double = 1.0) =
      HitSchema.make(Array(pep, pro, score, falseProb))

    // A file of Hit records keyed by K.
    type HitFile[K] = RemyFile[Hit, K]

    // Use HitFile(filename) to read a HitFile from disk.
    given HitFile: RemyTSVFile[Hit] = RemyTSVFile[Hit](using HitSchema)


    /** [[Call(pro, accPEP, conf, label, fdr, qval)]]
     *  is a call made on protein `pro`. `fdr` and `qval` default to 1.0
     *  until they are computed by `fdrComputed` and `qvalComputed`.
     */

    type Call = Remy {
      val pro: PROTEIN      // the protein being called
      val accPEP: Double    // accumulated false prob of the protein; smaller is better
      val conf: Double      // confidence, -10 * log10(accPEP + 1E-14); larger is better
      val label: Int        // 1 for a target protein, -1 for a decoy
      val fdr: Double       // local FDR
      val qval: Double      // q-value
    }

    // Column names and types of a Call, in TSV column order.
    given CallSchema: RemySchema[Call] = RemySchema(
      Vector(
        "pro"    -> STRING,
        "accPEP" -> DOUBLE,
        "conf"   -> DOUBLE,
        "label"  -> INT,
        "fdr"    -> DOUBLE,
        "qval"   -> DOUBLE))

    // Constructor for Call records.
    def Call(
          pro: PROTEIN,
          accPEP: Double,
          conf: Double,
          label: Int,
          fdr: Double = 1.0,
          qval: Double = 1.0) =
      CallSchema.make(Array(pro, accPEP, conf, label, fdr, qval))

    // A file of Call records keyed by K.
    type CallFile[K] = RemyFile[Call, K]

    // Use CallFile(filename) to read a CallFile from disk.
    given CallFile: RemyTSVFile[Call] = RemyTSVFile[Call](using CallSchema)


    /** [[Cpx(cpxId, pro, targetProb, decoyProb)]]
     *  records that protein `pro` is a member of complex `cpxId`.
     *  I.e., Cpx(x,y,_,_) iff complex x has protein y as a member protein.
     *  Both probabilities default to 1.0 until set by `cpxAnnotated`.
     */

    type Cpx = Remy {
      val cpxId: CPXID          // the complex
      val pro: PROTEIN          // a member protein of the complex
      val targetProb: Double    // prob derived from target calls
      val decoyProb: Double     // prob derived from decoy calls
    }

    // Column names and types of a Cpx, in TSV column order.
    given CpxSchema: RemySchema[Cpx] = RemySchema(
      Vector(
        "cpxId"      -> INT,
        "pro"        -> STRING,
        "targetProb" -> DOUBLE,
        "decoyProb"  -> DOUBLE))

    // Constructor for Cpx records.
    def Cpx(
          cpxId: CPXID,
          pro: PROTEIN,
          targetProb: Double = 1.0,
          decoyProb: Double = 1.0) =
      CpxSchema.make(Array(cpxId, pro, targetProb, decoyProb))

    // A file of Cpx records keyed by K.
    type CpxFile[K] = RemyFile[Cpx, K]

    // Use CpxFile(filename) to read a CpxFile from disk.
    given CpxFile: RemyTSVFile[Cpx] = RemyTSVFile[Cpx](using CpxSchema)


    /** Deal with heterogeneous naming formats of Uniprot ID.
     *  The ID of reference proteins usually has the form ">sp|xxxx".
     *  The ID of decoy proteins has the form ">DECOY_sp|xxxx".
     *  So, we cut off the part before the first "|" and keep the field
     *  that follows it. An ID with no such field is returned unchanged.
     */

    extension [R<:Remy{ val pro: PROTEIN }](r: R)
      def uniprot: String =
        val id = r.pro
        id.split('|') match
          case Array(_, accession, _*) => accession   // at least two fields
          case _                       => id          // nothing after a "|"


    /** Sort keys.
     *  Hit/PSM files can be keyed by pep, by pro, or by (pep, pro).
     *  Call files can be keyed by conf in descending order, or by fdr.
     *  Complex files can be keyed by cpxId, by pro, or by uniprot id.
     */

    // Ascending on peptide.
    def kPep[R<:Remy{val pep: PEPTIDE}] = Key.asc[R,PEPTIDE](r => r.pep)

    // Ascending on protein.
    def kPro[R<:Remy{val pro: PROTEIN}] = Key.asc[R,PROTEIN](r => r.pro)

    // Ascending on the (peptide, protein) pair.
    def kPepPro[R<:Remy{val pep: PEPTIDE; val pro: PROTEIN}] =
      Key.asc[R,(PEPTIDE,PROTEIN)](r => (r.pep, r.pro))

    // Descending on conf.
    def kConfrev[R<:Remy{val conf: Double}] = Key.dsc[R,Double](r => r.conf)

    // Ascending on complex id.
    def kCpxId[R<:Remy{val cpxId: CPXID}] = Key.asc[R,CPXID](r => r.cpxId)

    // Ascending on the uniprot id extracted from the protein field.
    def kUP[R<:Remy{val pro: PROTEIN}] = Key.asc[R,String](r => r.uniprot)

    // Ascending on fdr.
    def kFDR[R<:Remy{val fdr: Double}] = Key.asc[R,Double](r => r.fdr)
    

    /** PSMs & protein calls are mapped to an on-disk TSV file, 
     *  via Remy database connectivity encoder/decoder.
     *
     *  Each extension below wraps a collection of records as a transient
     *  RemyFile tagged with one of the sort keys defined above.
     *  NOTE(review): whether transientRemyFile sorts the records or only
     *  records the key is not visible here; callers in this file apply
     *  `.ordered` whenever they need sorting, so presumably it does not sort.
     */

    // Records with a `pep` field: file keyed by peptide.
    extension [R<:Remy{val pep:PEPTIDE}](r: IterableOnce[R])
      def kPepFile(using RemyTSVFile[R]): RemyFile[R,PEPTIDE] = r.transientRemyFile(kPep[R])

    // extension [R<:Remy{val pep:PEPTIDE}](r: OSeq[R,_])
    //  def kPepFile(using RemyTSVFile[R]): RemyFile[R,PEPTIDE] = 
    //    r.assumeOrderedBy(kPep[R]).transientRemyFile

    // Records with a `pro` field: file keyed by protein, or by uniprot id.
    extension [R<:Remy{val pro:PROTEIN}](r: IterableOnce[R])
      def kProFile(using RemyTSVFile[R]): RemyFile[R,PROTEIN] = r.transientRemyFile(kPro[R])
      def kUPFile(using RemyTSVFile[R]): RemyFile[R,PROTEIN] = r.transientRemyFile(kUP[R])

    // Records with both `pep` and `pro` fields: file keyed by (peptide, protein).
    extension [R<:Remy{val pro: PROTEIN; val pep: PEPTIDE}](r: IterableOnce[R])
      def kPepProFile(using RemyTSVFile[R]): RemyFile[R,(PEPTIDE,PROTEIN)] = r.transientRemyFile(kPepPro[R])

    // extension [R<:Remy{val pro: PROTEIN; val pep: PEPTIDE}](r: OSeq[R,_])
    //  def kPepProFile(using RemyTSVFile[R]): RemyFile[R,(PEPTIDE,PROTEIN)] = 
    //    r.assumeOrderedBy(kPepPro[R]).transientRemyFile

    // Records with a `conf` field: file keyed by conf, descending.
    extension [R<:Remy{val conf:Double}](r: IterableOnce[R])
      def kConfrevFile(using RemyTSVFile[R]): RemyFile[R,Double] = r.transientRemyFile(kConfrev[R])

    // Records with a `cpxId` field: file keyed by complex id.
    extension [R<:Remy{val cpxId:CPXID}](r: IterableOnce[R])
      def kCpxIdFile(using RemyTSVFile[R]): RemyFile[R,CPXID] = r.transientRemyFile(kCpxId[R])

    // Records with an `fdr` field: file keyed by fdr.
    extension [R<:Remy{val fdr:Double}](r: IterableOnce[R])
      def kFDRFile(using RemyTSVFile[R]): RemyFile[R,Double] = r.transientRemyFile(kFDR[R])


    /** Reference proteins and decoys.
     *
     *  Reads a FASTA file and records the sequence length of every entry.
     *  Importing the members of an instance (`import db.*`) makes the
     *  `len` extension available on protein ids.
     *
     *  @param filename  FASTA file of reference proteins and decoys.
     */

    case class ProteinDB(filename: String) extends Resource {

      // Read reference proteins and decoys
      val proteins = FASTAFile(filename)

      // Protein id -> sequence length. The id is the FASTA name up to
      // (excluding) the first space. takeWhile is used instead of
      // split(" ")(0) because the latter throws on a name made only of
      // spaces; for every other name the two give the same id.
      val proteinLen =
        proteins
          .map { p => p.name.takeWhile(_ != ' ') -> p.seq.length }
          .toMap

      // Endow proteins with len methods
      extension (pro: PROTEIN)
        /** Sequence length of this protein.
         *  @throws NoSuchElementException if the protein is not in the file.
         */
        def len: Int =
          proteinLen.getOrElse(
            pro,
            // Same exception type as a failed Map lookup, but the message
            // names the missing protein and the file that was searched.
            throw new NoSuchElementException(
              s"Protein '$pro' not found in reference protein file '$filename'"))

      // Initialization codes
      use(proteins)
    }


    /** Reference protein complexes.
     *
     *  @param filename  File of reference complexes, read with CorumFile.
     *  @param organism  Keep only complexes of this organism.
     *  @param size      Keep only complexes with at least this many subunits.
     *
     *  NOTE(review): unlike ProteinDB, nothing here calls use(...) on the
     *  underlying file -- confirm that closing a ComplexDB releases it.
     */

    case class ComplexDB(
          filename: String,
          organism: String = ORGANISM,
          size: Int = SIZE)
        extends Resource:

      // Read reference complexes
      val complexes = CorumFile(filename)

      // One Cpx record per (complex, member protein) pair of the selected
      // complexes, keyed by uniprot id, ordered, and materialized.
      val cpxPro =
        val wanted = complexes.filter { c =>
          c.organism == organism && c.subunits.length >= size
        }
        val members = wanted.flatMap { c =>
          c.subunitsUniprot.map(member => Cpx(c.complexID, member))
        }
        members.kUPFile.ordered.materialized

      // Keys of the clusters of cpxPro, ignoring records with an empty
      // protein field.
      val proteins =
        cpxPro
          .filter(cp => cp.pro != "")
          .clustered
          .map { case (key, _) => key }
          .toVector

    end ComplexDB


    /** Select peptides whose score is at most `threshold` and turn them
     *  into hits, one per (peptide, protein) pair. When a (pep, pro) pair
     *  occurs several times, only the lowest score is kept.
     *
     *  @param threshold  Peptides with score > threshold are dropped.
     *  @return hits keyed by peptide; the result holds on to `peptides`
     *          via `use`.
     */

    extension (peptides: PEPTIDEFILE)
      def selected(threshold: Double = THRESHOLD): HitFile[PEPTIDE] =
        // Intermediate file: every (pep, pro) pair, ordered by (pep, pro).
        val sortedByPepPro = peptides
          .filter(_.score <= threshold)
          .flatMap(p => p.hits.map(h => Hit(p.seq, h.accession, p.score)))
          .kPepProFile
          .ordered

        // try/finally so that the intermediate file is released even when
        // building the selection fails.
        try
          sortedByPepPro
            .clustered
            .map({ case ((pep, pro), ps) => Hit(pep, pro, ps.map(_.score).min) })
            .kPepFile
            .serialized.use(peptides)
        finally
          sortedByPepPro.close()


    /** Compute the false prob of each PSM, i.e., the prob that the PSM
     *  is not a hit of the peptide to the protein.
     *
     *  Within the hits of one peptide, each protein gets a weight of
     *  1 / (protein length), normalized to sum to 1 over that peptide's
     *  hits; falseProb = 1 - (1 - score) * normalized weight.
     *
     *  @param proteinDB  Supplies the length of each protein.
     */

    extension (hits: HitFile[PEPTIDE])
      def normalized(proteinDB: ProteinDB): HitFile[PEPTIDE] = {
        import proteinDB.*    // brings the `len` extension into scope

        // Re-score all hits of a single peptide.
        def normalizeHits(hs: Vector[Hit]): Vector[Hit] =
          def weight(h: Hit) = 1.0 / h.pro.len.toDouble
          val totalWt = SUM(weight) of hs
          hs.map { h =>
            val share = weight(h) / totalWt
            Hit(h.pep, h.pro, h.score, 1.0 - ((1.0 - h.score) * share))
          }

        hits
          .assumeOrderedBy(kPep[Hit])    // Assume hits are already sorted on kPep
          .clustered
          .flatMap { case (_, group) => normalizeHits(group) }
          .kPepFile
          .serialized.use(hits)
      }



    /** Compute false reporting prob for proteins.
     *
     *  For each protein, accPEP is the product of the falseProb of its
     *  hits, and conf = -10 * log10(accPEP + 1E-14). Proteins whose id
     *  starts with "DECOY" are labelled -1, all others 1.
     *  The result is ordered by conf, in descending order.
     */

    extension (hits: HitFile[PEPTIDE])
      def scoredProteins: CallFile[CONFrev] = {
        // Build the call for one protein from all of its hits.
        def confScore(p: PROTEIN, hs: Vector[Hit]): Call =
          val accPEP = PROD[Hit,Double](_.falseProb) of hs
          val conf   = -10 * log10(accPEP + 1E-14)   // 1E-14 keeps log10 finite
          val label  = if (p startsWith "DECOY") -1 else 1
          Call(p, accPEP, conf, label)

        // Intermediate file: the hits re-ordered by protein.
        val sortedByPro = hits.orderedBy(kPro)

        // try/finally so that the intermediate file is released even when
        // scoring fails.
        try
          sortedByPro
            .clustered
            .map({ case (p, hs) => confScore(p, hs) })
            .kConfrevFile
            .ordered
            .serialized.use(hits)
        finally
          sortedByPro.close()
      }


    /** Compute local FDR. The local FDR of a protein with conf = s is
     *  computed as (1 + # decoy with conf >= s)/(1 + # target with conf >= s)
     *  The input is taken to be in descending order of conf, and the
     *  result keeps the same kConfrev key.
     *  NB. This defn of FDR is not the usual statistical FDR; but it is
     *  used widely in proteomics, e.g., EPIFANY and Fido on the openMS platform.
     */

    extension (calls: CallFile[CONFrev])
      def fdrComputed: CallFile[CONFrev] = {
        // Running totals over all clusters seen so far; this relies on the
        // clusters being visited once, in descending order of conf.
        var ndecoy  = 0
        var ntarget = 0

        // FDR for one cluster of calls sharing the same conf value.
        def lfdr(sc: (Double,Vector[Call])): Vector[Call] =
          val group = sc._2
          ndecoy  += group.count(_.label == -1)
          ntarget += group.count(_.label == 1)
          val fdr = (1.0 + ndecoy.toDouble) / (1.0 + ntarget.toDouble)
          group.map(c => Call(c.pro, c.accPEP, c.conf, c.label, fdr))

        calls
          .kConfrevFile  // assume protein calls in descending order of conf
          .clustered
          .flatMap(lfdr)
          .kConfrevFile
          .serialized.use(calls)
      }


    /** Compute Q value. Q value of a protein X is the best (smallest) FDR
     *  among proteins whose conf value is no better than X's.
     *  The input is taken to be in descending order of conf; so is the
     *  result.
     */

    extension (calls: CallFile[CONFrev])
      def qvalComputed: CallFile[CONFrev] = {
        // Running minimum of fdr while walking up from the worst conf.
        var qval = 1.0

        // Intermediate file: the calls in ascending order of conf.
        val ascending = calls
          .kConfrevFile // assume protein calls in descending order of conf
          .reversed     // make protein calls in ascending order of conf

        // try/finally so that the intermediate file is released even when
        // the computation fails.
        try
          ascending
            .map { c =>
              qval = qval min c.fdr
              Call(c.pro, c.accPEP, c.conf, c.label, c.fdr, qval)
            }
            .kConfrevFile
            .reversed     // put protein calls into descending order of conf
            .serialized.use(calls)
        finally
          ascending.close()
      }


    /** The Q value above is adopted by many proteomics MS software.
     *  But it does not actually correspond to global FDR. So, here
     *  is an alternative which stores the global FDR in the qval field:
     *  (# decoys so far) / (# decoys so far + # targets so far),
     *  counting down the calls in descending order of conf.
     */

    extension (calls: CallFile[CONFrev])
      def globalFDRComputed: CallFile[CONFrev] = {
        // Running counts; any label other than 1 is counted as a decoy.
        var decoys = 0.0
        var proteins = 0.0

        calls
          .kConfrevFile // assume protein calls in descending order of conf
          .map { c =>
            if c.label == 1 then proteins += 1.0 else decoys += 1.0
            val globalFDR = decoys / (decoys + proteins)
            Call(c.pro, c.accPEP, c.conf, c.label, c.fdr, globalFDR)
          }
          .serialized.use(calls)
      }



    /** Compute prob of a protein complex being present.
     *  Each member protein is first given the smallest accPEP among the
     *  calls synced with it (1.0 if there is none), separately for
     *  target and decoy calls. The complex prob is then the mean of these
     *  values over the members whose value is below 1.0 (1.0 if there is
     *  no such member). Smaller better.
     *
     *  The result has one record per (complex, member) pair, carrying the
     *  complex-level targetProb and decoyProb, keyed by cpxId.
     */

    extension (cpxDB: ComplexDB)
      def cpxAnnotated(calls: CallFile[CONFrev]): CpxFile[CPXID] = {
        import cpxDB._
        val saved = calls.serialized
        // Target and decoy calls, each ordered by uniprot id.
        val targets = saved.filter(_.label == 1).kUPFile.ordered
        val decoys  = saved.filter(_.label == -1).kUPFile.ordered
        // Two records can see each other iff their uniprot ids are equal.
        val canSee  = (y: String, x: String) => y == x
        // NOTE(review): presumably these iterators step through the calls
        // in step with cpxPro, which is ordered on the same key -- confirm
        // against Synchrony.siterator.
        val targetSI = targets.siterator(kUP, canSee)
        val decoySI = decoys.siterator(kUP, canSee)

        // Per-member annotation: smallest accPEP of the synced target/decoy
        // calls, or 1.0 when there is none. Re-ordered by complex id.
        val annotated = {
          for cp <- cpxPro
          yield
            val ts = targetSI.syncedWith(cp);
            val ds = decoySI.syncedWith(cp);
            val tprob = if (ts.isEmpty) 1.0 else ts.map(_.accPEP).min
            val dprob = if (ds.isEmpty) 1.0 else ds.map(_.accPEP).min
            Cpx(cp.cpxId, cp.pro, tprob, dprob)
        }.kCpxIdFile.ordered

        try 
          annotated
            .clustered
            .flatMap({ case (cpxId, ps) => 
              // Only members with a value below 1.0 contribute to the mean.
              val pt = ps.filter(_.targetProb < 1.0)
              val pd = ps.filter(_.decoyProb < 1.0)
              val t = 
                if pt.length == 0 then 1.0
                else AVERAGE[Cpx](_.targetProb) of pt
              val d = 
                if pd.length == 0 then 1.0 
                else AVERAGE[Cpx](_.decoyProb) of pd
              // Every member record gets the complex-level values.
              ps.map(p => Cpx(cpxId, p.pro, t, d))}) 
            .kCpxIdFile
            .serialized.use(calls)
        finally
          // Release all intermediates, whether or not the above succeeded.
          annotated.close()
          targetSI.close()
          decoySI.close()
          targets.close()
          decoys.close()
          saved.close()
        }


    /** Update prob of a protein present
     *  as     min(p.accPEP, min(c.targetProb | p is in complex c))
     *  and as min(p.accPEP, min(c.decoyProb  | p is in complex c))
     *  for target and decoy proteins respectively; conf is updated to
     *  match. FDR and Q value are then recomputed on the updated calls.
     */  

    extension (calls: CallFile[CONFrev])
      def cpxComputed(complexDB: ComplexDB): CallFile[CONFrev] = {
        val saved = calls.serialized
        // The calls, ordered by uniprot id.
        val qs = saved.kUPFile.ordered
        // Complex-level probs, re-ordered by uniprot id of the member protein.
        val annot = complexDB.cpxAnnotated(saved)  
        val cs = annot.kUPFile.ordered
        // Two records can see each other iff their uniprot ids are equal.
        val canSee  = (y: String, x: String) => y == x 
        // NOTE(review): presumably steps through cs in step with qs, which
        // is ordered on the same key -- confirm against Synchrony.siterator.
        val cSI = cs.siterator(kUP, canSee)

        // Pick the complex prob that matches the label of the call:
        // targetProb for label 1, decoyProb otherwise.
        def cProb(l: Int) = (c: Cpx) => 
          if l== 1 then c.targetProb else c.decoyProb 

        val scored = {
          for q <- qs
          yield
            val cps   = cSI.syncedWith(q)
            // Prob taken from the complexes synced with q; q's own accPEP
            // when there is none.
            val cPEP  = 
              if cps.length == 0 then q.accPEP
              else SMALLEST(cProb(q.label)) of cps
            val cConf = -10 * log10(cPEP + 1E-14)
            // Keep whichever prob is smaller, with the matching conf.
            val pep   = if q.accPEP >= cPEP then cPEP else q.accPEP
            val conf  = if q.accPEP >= cPEP then cConf else q.conf
            Call(q.pro, pep, conf, q.label)
        }.kConfrevFile.ordered

        try 
          scored
            .fdrComputed
            .qvalComputed
            .serialized.use(calls)
        finally 
          // Release all intermediates, whether or not the above succeeded.
          scored.close()
          cSI.close()
          cs.close()
          annot.close()
          qs.close()
          saved.close()
      }


    /** ProInfer
     *
     *  Protein inference over an open PSM file, given a reference protein
     *  database and a reference complex database.
     *
     *  @param proteinDB  Reference proteins and decoys.
     *  @param complexDB  Reference protein complexes; not used by runNoCpx.
     */

    case class ProInfer(proteinDB: ProteinDB, complexDB: ComplexDB):
 
      /** Call proteins without using protein complex info.
       *
       *  @param peptides   The PSM file.
       *  @param threshold  Peptides with score > threshold are dropped.
       */
      def runNoCpx(
            peptides: PEPTIDEFILE,
            threshold: Double = THRESHOLD): CallFile[CONFrev] =
        peptides
          .selected(threshold)
          .normalized(proteinDB)
          .scoredProteins
          .fdrComputed
          .qvalComputed

      /** Call proteins, then use protein complex info once to refine
       *  the calls.
       */
      def runCpx(
            peptides: PEPTIDEFILE,
            threshold: Double = THRESHOLD): CallFile[CONFrev] =
        runNoCpx(peptides, threshold).cpxComputed(complexDB)

      /** Iterate the complex-based refinement until too few extra
       *  proteins are called.
       *
       *  Round 0 is runNoCpx; every later round applies cpxComputed to the
       *  previous round. Each round is saved as s"${prefix}-${round}".
       *  Iteration stops after the first round that gains fewer than
       *  `minNewCalls` target proteins with qval < qvalThreshold.
       *
       *  @param peptides       The PSM file.
       *  @param prefix         Prefix of the per-round files.
       *  @param threshold      Peptides with score > threshold are dropped.
       *  @param qvalThreshold  A target protein counts as called when its
       *                        qval is below this value.
       *  @param minNewCalls    Minimum gain in called proteins needed to
       *                        run another round (previously fixed at 10).
       *  @return the calls of the last round.
       */
      def iterateRunCpx(
            peptides: PEPTIDEFILE,
            prefix: String = "run-",
            threshold: Double = THRESHOLD,
            qvalThreshold: Double = QVAL,
            minNewCalls: Int = 10): CallFile[CONFrev] = {

        // Report the gain of runB over runA; true when it is too small
        // to justify another round.
        def check(rno: Int, runA: CallFile[CONFrev], runB: CallFile[CONFrev]) =
          val a = runA.filter(p => p.label == 1 && p.qval < qvalThreshold)
          val b = runB.filter(p => p.label == 1 && p.qval < qvalThreshold)
          val aLen = a.done { _.length }
          val bLen = b.done { _.length }
          println(s"* rno=$rno * aLen=$aLen * bLen=$bLen * diff=${bLen - aLen}")
          (bLen - aLen) < minNewCalls

        // Run round `rno` on top of `previous`, then stop or recurse.
        @scala.annotation.tailrec
        def iterate(rno: Int, previous: CallFile[CONFrev]): CallFile[CONFrev] =
          val current = previous.cpxComputed(complexDB).saveAs(s"${prefix}-${rno}")
          if check(rno, previous, current) then current
          else iterate(rno + 1, current)

        iterate(1, runNoCpx(peptides, threshold).saveAs(s"${prefix}-${0}"))
      }


    object ProInfer:

      /** Call proteins in a PSM file (peptideFile), 
       *  using a reference protein/decoy database (proteinFile)
       *  and a reference protein complex database (complexFile).
       *  Iterate until too few new proteins are called.
       *  The final calls are saved to outFile; the three input resources
       *  are closed before returning, whether or not inference succeeds.
       */

      def apply(
            peptideFile: String,
            outFile: String,
            proteinFile: String,
            complexFile: String,
            organism: String = ORGANISM,
            prefix: String = "run-",
            threshold: Double = THRESHOLD,
            qvalThreshold: Double = QVAL): CallFile[CONFrev] =
        val proteins  = ProteinDB(proteinFile)
        val peptides  = PeptideFile(peptideFile)
        val complexes = ComplexDB(complexFile, organism)
        try
          val engine = new ProInfer(proteins, complexes)
          val calls  = engine.iterateRunCpx(peptides, prefix, threshold, qvalThreshold)
          calls.saveAs(outFile)
        finally
          proteins.close()
          peptides.close()
          complexes.close()
    

      /** Call proteins in a PSM file (peptideFile), 
       *  using a reference protein/decoy database (proteinFile)
       *  and a reference protein complex database (complexFile).
       *  One round of inference only, no iteration.
       *  The calls are saved to outFile; the three input resources are
       *  closed before returning, whether or not inference succeeds.
       */

      def runCpx(
            peptideFile: String,
            outFile: String,
            proteinFile: String,
            complexFile: String,
            organism: String = ORGANISM,
            threshold: Double = THRESHOLD): CallFile[CONFrev] =
        val proteins  = ProteinDB(proteinFile)
        val peptides  = PeptideFile(peptideFile)
        val complexes = ComplexDB(complexFile, organism)
        try
          val engine = new ProInfer(proteins, complexes)
          val calls  = engine.runCpx(peptides, threshold)
          calls.saveAs(outFile)
        finally
          proteins.close()
          peptides.close()
          complexes.close()


      /** Call proteins in a PSM file (peptideFile), 
       *  using a reference protein/decoy database (proteinFile).
       *  Not using a reference protein complex database (complexFile).
       *  One round of inference only, no iteration.
       *  The calls are saved to outFile; both input resources are closed
       *  before returning, whether or not inference succeeds.
       */

      def runNoCpx(
            peptideFile: String,
            outFile: String,
            proteinFile: String, 
            threshold: Double = THRESHOLD): CallFile[CONFrev] =
        val proteins = ProteinDB(proteinFile)
        val peptides = PeptideFile(peptideFile)
        try
          // No complex database is needed: the instance-level runNoCpx
          // never touches complexDB, hence the null placeholder.
          val engine = new ProInfer(proteins, null)
          val calls  = engine.runNoCpx(peptides, threshold)
          calls.saveAs(outFile)
        finally
          proteins.close()
          peptides.close()


      /** Analysis of runCpx vs runNoCpx
       *
       *  Keeps, from each call file, the target proteins (label == 1)
       *  with qval < qvalThreshold, then saves four files and prints
       *  their sizes:
       *    - s"${prefix}-cpx.result"        proteins called by runCpx
       *    - s"${prefix}-nocpx.result"      proteins called by runNoCpx
       *    - s"${prefix}-cpx-only.result"   called by runCpx only
       *    - s"${prefix}-nocpx-only.result" called by runNoCpx only
       */

      def analysis(
            cpxCallFile: String,        // results of runCpx 
            nocpxCallFile: String,      // results of runNoCpx
            prefix:        String,      // prefix for saving analysis results
            qvalThreshold: Double) = {

        // Proteins called with complex info, ordered by protein.
        val cpx: CallFile[PROTEIN] =
          RemyFile[Call](cpxCallFile)
            .filter(p => p.label == 1 && p.qval < qvalThreshold)
            .kProFile
            .ordered
            .saveAs(s"${prefix}-cpx.result")

        // Proteins called without complex info, ordered by protein.
        val nocpx: CallFile[PROTEIN] = 
          RemyFile[Call](nocpxCallFile)
            .filter(p => p.label == 1 && p.qval < qvalThreshold)
            .kProFile
            .ordered
            .saveAs(s"${prefix}-nocpx.result")

        // Calls in cpx that have no synced call in nocpx.
        // NOTE(review): presumably syncedWith returns the nocpx calls with
        // an equal key -- confirm against Synchrony.siterator.
        val cpxOnly  =
          val nocpxSI = nocpx.siterator(cpx.key, cpx.key.ord.equiv)
          val tmp = 
            for 
              c <- cpx
              ns = nocpxSI.syncedWith(c)
              if ns.isEmpty 
            yield c
          try tmp.kProFile.saveAs(s"${prefix}-cpx-only.result")
          finally nocpxSI.close()

        // Calls in nocpx that have no synced call in cpx.
        val nocpxOnly =
          val cpxSI = cpx.siterator(nocpx.key, cpx.key.ord.equiv)
          val tmp = 
            for 
              n <- nocpx
              cs = cpxSI.syncedWith(n)
              if cs.isEmpty
            yield n
          try tmp.kProFile.saveAs(s"${prefix}-nocpx-only.result")
          finally cpxSI.close()

        // Report the size of each result.
        println("")
        println(s"***        cpx: ${cpx.done { _.length }}")
        println(s"***      nocpx: ${nocpx.done { _.length }}")
        println(s"***   cpx excl: ${cpxOnly.done { _.length }}")
        println(s"*** nocpx excl: ${nocpxOnly.done { _.length }}")
      } 


      /** ProInfer requires the reference protein file to include
       *  the decoy proteins used for peptide-spectrum matching (PSM).
       *  The FASTA id of these decoy proteins is prefixed "DECOY_".
       *  Here is a quick routine to take a normal protein file and
       *  add decoys to it (decoys are generated by reversing proteins.)
       *
       *  @param proteinFile  A FASTA file of reference proteins
       *  @param decoyFile    The resulting FASTA file with decoys added.
       *
       *  [[makeDecoys]] is intended also as commandline-executable.
       *  It can be run from the commandline like this:
       *  scala.bat proteomics.makedecoys IN OUT
       */

      def makeDecoys(proteinFile: String, decoyFile: String) =
        // A decoy is the reversed sequence, named "DECOY_" + original name.
        def mkDecoy(protein: FASTAEntry) =
          protein.newSeq(protein.seq.reverse).newName("DECOY_" + protein.name)

        val db = ProteinDB(proteinFile)
        val refs = db.proteins
        // try/finally so that the reference file is released even when
        // writing the output fails.
        // NOTE(review): only refs is closed, as before; db itself is not.
        // Confirm whether db.close() should be called instead.
        try
          val decoys = for p <- refs yield mkDecoy(p)
          FASTAFile.transientFASTAFile(refs ++ decoys).saveAs(decoyFile).close()
        finally
          refs.close()


      /** Commandline entry point for [[makeDecoys]].
       *  Prints the usage text when "--help" is among the arguments,
       *  and a short complaint when fewer than two arguments are given.
       */
      @main def makedecoys(args: String*): Unit = {
        val doc =
          """
          | MakeDecoys is a program to generate decoy proteins from
          | a set of reference proteins. It does this by reversing
          | each reference proteins.
          |
          | Invoke it like this:
          |
          |    scala makedecoys proteinFile outFile
          |
          | The meaning of the parameters are as follow:
          |
          |   proteinFile - FASTA file of reference proteins.
          |   outFile     - FASTA file of reference proteins and their decoys.
          | """.stripMargin

        if args.contains("--help") then println(doc)
        else
          args match
            // Both file names are required; extra arguments are ignored.
            case Seq(proteinFile, outFile, _*) => makeDecoys(proteinFile, outFile)
            case _                             => println("Not enough arguments")
      }


      /** [[proinfer]] is intended as commandline-executable.
       *  It can be run from the commandline like this:
       *  scala.bat proteomics.proinfer PROT CPLX IN OUT Human RUN 0.999 0.01 
       *
       *  The first three arguments are required; the rest have defaults.
       *  Prints the usage text when "--help" is among the arguments.
       */

      @main def proinfer(args: String*): Unit = {
        val doc  = 
          """ 
          | ProInfer is a protein assembly method which assists 
          | recovery of proteins supported only by ambiguous peptides.
          |
          | Invoke it like this:
          |
          |    scala proinfer \
          |      proteinFile complexFile peptideFile outFile \
          |      organism  prefix  threshold  qvalThreshold
          |
          | The meaning of the parameters are as follow:
          |
          |   proteinFile - A FASTA file of reference proteins and decoys.
          |       Reference proteins have ID of the form ">XX|Blah".
          |       Decoys have ID of the form ">DECOY_XX|Blah".
          |   complexFile - Reference protein complexes, CORUM format.
          |   peptideFile - An initial PSM file.
          |   outFile     - Name of the output file of called proteins
          |       (default: peptideFile's name prefixed "called-proteins-for-").
          |   organism    - Use only complexes of this organism (default: Human).
          |   prefix      - Prefix of files containing intermediate results
          |       (default: run-).
          |   threshold   - Discard peptides if score > threshold (default: 0.999).
          |   qvalThreshold - Target proteins with qval < qvalThreshold count as
          |       called when deciding whether to stop iterating (default: 0.01).
          | """.stripMargin
          
        val numArgs = args.length
        def sArg(p: Int, d: String) = if p < numArgs then args(p) else d 
        def dArg(p: Int, r: Double) = if p < numArgs then args(p).toDouble else r 

        // Default output name. The prefix is put on the file name only, not
        // on the whole path: "dir/in.tsv" gives "dir/called-proteins-for-in.tsv"
        // rather than a path into a directory that does not exist.
        // For a bare file name the result is the same as plain concatenation.
        def defaultOutFile(input: String): String =
          val in   = new java.io.File(input)
          val name = "called-proteins-for-" + in.getName
          Option(in.getParent).fold(name)(dir => new java.io.File(dir, name).getPath)

        if args.contains("--help") then println(doc)
        else if numArgs < 3 then println("Not enough arguments")
        else
          val proteinFile = args(0)
          val complexFile = args(1)
          val peptideFile = args(2)
          val outFile = sArg(3, defaultOutFile(peptideFile))
          val organism = sArg(4, ORGANISM)
          val prefix = sArg(5, "run-")
          // Use the shared defaults rather than repeating the literals.
          val threshold = dArg(6, THRESHOLD)
          val qvalThreshold = dArg(7, QVAL)
          val run = ProInfer(
            peptideFile, outFile, proteinFile, complexFile,
            organism, prefix, threshold, qvalThreshold)
          run.close()
      }
  
    end ProInfer
 
  end ProteinInference



/** Example *********************************************
 *
 *
{{{

   import proteomics.ProteinInference.ProInfer

//
// Test data
//

   def db(s: String) = "test-proinfer/" ++ s
   val REFPROTEINS  = db("human-proteins-including-decoys.fasta")
   val REFCOMPLEXES = db("complexes.txt")
   val INPUT        = db("DDA1.tsv")
   val OUTPUT       = "DDA1-called-proteins.tsv"

//
// Run ProInfer on test data above using default parameter values
//

   ProInfer(INPUT, OUTPUT, REFPROTEINS, REFCOMPLEXES)



//
// Run analysis
//

   ProInfer.runCpx(INPUT, "withcpx.tsv", REFPROTEINS, REFCOMPLEXES)

   ProInfer.runNoCpx(INPUT, "nocpx.tsv", REFPROTEINS)

   ProInfer.analysis("withcpx.tsv", "nocpx.tsv", "analysis-", 0.05)

}}}
 *
 *
 */


