

package proteomics

/** Version 11.1, to be used with dbmodel.DBFile Version 11.1.
 *
 *  Wong Limsoon
 *  14 May 2023
 */


  object FASTAModel:
 
    /** A simple model to turn FASTA files into ordered collections.
     *  Ordered collections come with Synchrony iterators, which provide
     *  efficient general synchronized iteration over multiple collections.
     */

    import scala.language.implicitConversions
    import dbmodel.Synchronizable.CBI
    import dbmodel.OrderedCollection.{ OColl, Key }
    import dbmodel.DBFile.{ OFile, OTransient, PARSER, FORMATTER }
    import dbmodel.DBFile.TMP    // Default folder to use.


    // Shorthand for Boolean.
    type Bool = Boolean
    // Metadata of an entry: field names mapped to untyped values.
    type META = Map[String,Any]
    // Type of the numeric identifier of an entry.
    type ID   = Int



    /** One record of a FASTA file.
     *
     *  The fields "id", "name" and "seq" are built in; every other field
     *  is kept in `meta`. Entries are immutable: each update method
     *  returns a new entry.
     *
     *  @param id    numeric identifier of the entry.
     *  @param name  name (title) of the entry.
     *  @param seq   the sequence.
     *  @param meta  additional named fields.
     */
    case class FASTAEntry(id: ID, name: String, seq: String, meta: META):

      /** Update fields
       */

      def newId(id: ID): FASTAEntry = copy(id = id)

      def newName(name: String): FASTAEntry = copy(name = name)

      def newSeq(seq: String): FASTAEntry = copy(seq = seq)

      def newMeta(meta: META): FASTAEntry = copy(meta = meta)

      /** Set one field by name. "id", "name" and "seq" update the built-in
       *  field when the value has the matching type; anything else is
       *  stored in `meta` under `k`.
       */
      def newMeta(k: String, v: Any): FASTAEntry = (k, v) match
        case ("id", v: ID)       => newId(v)
        case ("name", v: String) => newName(v)
        case ("seq", v: String)  => newSeq(v)
        case _ => newMeta(meta + (k -> v))

      /** Set several fields at once. "id", "name" and "seq" replace the
       *  built-in fields; the remaining pairs are merged into `meta`.
       *
       *  @throws ClassCastException if "id", "name" or "seq" is given a
       *          value of the wrong type.
       */
      def ++(fields: (String, Any)*): FASTAEntry =
        val kv = fields.toMap
        val id = kv.getOrElse("id", this.id).asInstanceOf[ID]
        val nm = kv.getOrElse("name", this.name).asInstanceOf[String]
        val sq = kv.getOrElse("seq", this.seq).asInstanceOf[String]
        val extra = kv -- Seq("id", "name", "seq")
        FASTAEntry(id, nm, sq, meta ++ extra)

      /** Merge the given metadata into `meta`; the given values win. */
      def ++(meta: META): FASTAEntry = newMeta(this.meta ++ meta)


      /** Check whether a field is present
       */

      def hasMeta(k: String): Bool = meta.contains(k)

      /** Apply `check` to field `k`. Returns `error` when the field is
       *  missing, has the wrong type, or `check` fails with an ordinary
       *  exception. Fatal errors (e.g. OutOfMemoryError) are not
       *  swallowed; they propagate to the caller.
       */
      def checkMeta[A](k: String, check: A => Bool, error: Bool = false): Bool =
        try check(getMeta(k))
        catch { case scala.util.control.NonFatal(_) => error }


      /** Retrieve a field
       */

      def apply(k: String): Any     = getMeta[Any](k)

      def getInt(k: String): Int    = getMeta[Int](k)

      def getDbl(k: String): Double = getMeta[Double](k)

      def getStr(k: String): String = getMeta[String](k)

      /** Value of field `k`, cast to `A`. `meta` is consulted first; the
       *  built-in fields "id", "name" and "seq" are the fallback.
       *
       *  @throws java.util.NoSuchElementException if there is no such field.
       */
      def getMeta[A](k: String): A  =
        // Evaluated only when `meta` has no entry for `k`.
        def builtIn: Any = k match
          case "id"   => id
          case "name" => name
          case "seq"  => seq
          case _      =>
            throw new java.util.NoSuchElementException(s"FASTAEntry has no field '$k'")
        meta.getOrElse(k, builtIn).asInstanceOf[A]


      /** Delete and replace fields
       */

      def delMeta(k: String): FASTAEntry = newMeta(meta - k)

      def delMeta(): FASTAEntry = newMeta(meta.empty)


    object FASTAEntry:

      /** Build a [[FASTAEntry]] from its name and sequence alone.
       *  The id is set to 0; the metadata is empty unless supplied.
       */

      def apply(name: String, seq: String, meta: META = Map()): FASTAEntry =
        new FASTAEntry(0, name, seq, meta)

    end FASTAEntry

    
    // CBI over FASTA entries (CBI comes from dbmodel.Synchronizable;
    // presumably the iterator used for synchronized iteration).
    type FASTAITERATOR = CBI[FASTAEntry]
    // FASTA file whose key type is ID.
    type FASTAFILE = OFile[FASTAEntry,ID]
    // FASTA file whose key type is an arbitrary K.
    type FASTA[K] = OFile[FASTAEntry,K]

    /** FASTA files are not self-describing. Schemas are a way for users
     *  to tell which fields correspond to what. An entry "f -> get" in a
     *  schema vector says the field name is "f", and a string representing
     *  the value of "f" can be parsed using the function "get".
     */
    type SCHEMA = Vector[(String, String => Any)]


    object FASTAFile:
 
      /** Default key for FASTAFile
       */

      // Entries are keyed by their id, using the standard Int ordering.
      val fastaFileKey: Key[FASTAEntry,ID] =
        Key((entry: FASTAEntry) => entry.id, Ordering.Int)

      /** Constructors
       */

      // FASTA file named `filename`, located in the default folder TMP.
      def apply(filename: String): FASTAFILE = apply(filename, TMP)

      /** FASTA file named `filename` in `folder`, using the default key,
       *  parser and formatter. The file is assumed to be in FASTA format.
       */
      def apply(filename: String, folder: String): FASTAFILE =
        OFile(
          fastaFileKey,
          OFile.mkFileName(filename, folder).toString,
          parser,
          formatter)
 
      /** Like `apply(filename, folder)`, but lets the caller replace the
       *  parser and/or the formatter. Both default to the standard ones.
       */
      def customized(
            filename: String,
            folder: String,
            parser: PARSER[FASTAEntry] = parser,
            formatter: FORMATTER[FASTAEntry] = formatter): FASTAFILE =
        val path = OFile.mkFileName(filename, folder).toString
        OFile(fastaFileKey, path, parser, formatter)

      /** Sometimes, it is handy to have a transient FASTA file for
       *  temporary use without writing it to disk.
       */

      def transientFASTAFile(entries: IterableOnce[FASTAEntry]): FASTAFILE = 
        // The entries are wrapped into a CBI and handed to OTransient with
        // the default key, parser and formatter, plus OFile's destructor.
        // NOTE(review): whether OTransient consumes the entries eagerly
        // is not visible from here -- confirm in dbmodel.DBFile.
        OTransient[FASTAEntry,ID](
          fastaFileKey, CBI(entries),
          parser, formatter, OFile.destructorOFile)

      /** A transient FASTA file with no entries. It is built like
       *  `transientFASTAFile`, but from an empty CBI.
       */
      def emptyFASTAFile: FASTAFILE =
        // The destructor is passed by automatic eta-expansion, as in
        // transientFASTAFile; the trailing `_` form is deprecated in Scala 3.
        OTransient[FASTAEntry,ID](
          fastaFileKey, CBI(),
          parser, formatter, OFile.destructorOFile)

      /** Formatter for FASTA file. 
       *  Normal FASTA file does not use explicit field name. 
       *  A header is used here to introduce field name.
       */

      // Each call of `formatter(options)` yields a fresh, stateful format
      // function. The metadata labels are fixed by the first entry that
      // function formats (and re-fixed whenever position is 0); every
      // later entry is written with those labels. An entry lacking one
      // of the labels fails with NoSuchElementException; extra fields of
      // later entries are not written. `options` is not used.
      val formatter: FORMATTER[FASTAEntry] = { (options: String) =>

        // Labels in force; None until a first entry has been formatted.
        var labels: Option[Seq[String]] = None

        // Type name recorded in the header for a metadata value.
        def typeOf(e: Any): String = e match
          case _: Int    => "Int"
          case _: Double => "Double"
          case _ => "String"

        // Header line declaring "label:Type" per metadata field. It is
        // empty when there is no metadata, giving a plain FASTA file.
        def header(b: FASTAEntry, ls: Seq[String]): String =
          if ls.isEmpty then ""
          else
            s"; ##wonglimsoon@nus##\t" +
              ls.map(l => s"${l}:${typeOf(b.meta(l))}").mkString("\t") +
              "\n"

        // Title line (name, then the labelled values, tab-separated),
        // followed by the sequence in lines of at most 80 characters.
        def entry(b: FASTAEntry, ls: Seq[String]): String =
          val title = (b.name +: ls.map(l => b.meta(l).toString)).mkString("\t")
          val sq = b.seq.grouped(80).mkString("\n")
          s">${title}\n${sq}\n"

        def format(b: FASTAEntry, position: Int = 1): String =
          labels match
            case Some(ls) if position != 0 =>
              // Other entries. No need header.
              entry(b, ls)
            case _ =>
              // 1st entry (position 0), or the first entry this formatter
              // sees at all. Fix the labels and write header if needed.
              // Previously the latter case hit a null `labels`.
              val ls = b.meta.keys.toSeq.sorted
              labels = Some(ls)
              s"${header(b, ls)}${entry(b, ls)}"

        format
      }


      /** Parser for [[FASTAEntry]]. 
       */

      // Converters from the text of a field to its value.
      private val getInt: String => Int    = (e: String) => e.toInt
      private val getDbl: String => Double = (e: String) => e.toDouble
      private val getStr: String => String = (e: String) => e

      // Best-effort converter for fields of unknown type: an Int if the
      // text parses as one, else a Double, else the text itself. Only
      // non-fatal failures are treated as "does not parse".
      private val getAny: String => Any = (e: String) =>
        scala.util.Try(e.toInt)
          .orElse(scala.util.Try(e.toDouble))
          .getOrElse(e)

      // Converters indexed by the type names used in file headers.
      private val cvt: Map[String, String => Any] =
        Map("Int" -> getInt, "Double" -> getDbl, "String" -> getStr)

      // "No schema". This is deliberately null: the parser in this file
      // tests `schema == null` to decide whether fields are positional.
      private val emptySchema: Vector[(String, String => Any)] = null


      /** Parser for standard FASTA file
       */

      // Schema-less parser: title fields are named by their position.
      val stdParser: PARSER[FASTAEntry] =
        parser(sch = emptySchema)

  
      /** General schema-dependent parser
       */

      // Default parser: starts with no schema. A header line found in
      // the file may still define one.
      def parser: PARSER[FASTAEntry] =
        parser(sch = emptySchema)

      /** Build a parser that reads one FASTA entry per call.
       *
       *  Each call of the returned `PARSER` (with an option string that
       *  is not used) yields a stateful `parse` function, which
       *  remembers the schema in force and one look-ahead line between
       *  calls. Input lines are handled as follows:
       *   - a line starting with "; ##wonglimsoon@nus##" is a header; its
       *     tab-separated "name:Type" columns replace the schema;
       *   - any other line starting with ";", a blank line, or a stray
       *     line before a title is skipped;
       *   - a line starting with ">" is a title: the name, followed by
       *     tab-separated field values. The lines after it, up to the
       *     next blank, ";" or ">" line, are concatenated into the
       *     sequence.
       *
       *  Field values are decoded with the schema, column by column.
       *  Columns the schema does not cover (or all columns, when there
       *  is no schema) are named "1", "2", ... by position and decoded
       *  on a best-effort basis as Int, Double or String.
       *
       *  The id of each entry is the `position` passed to `parse`.
       *
       *  @param sch  the initial schema; null means no schema.
       *  @throws java.io.EOFException from `parse`, when the input has
       *          no further entry.
       */
      def parser(sch: SCHEMA): PARSER[FASTAEntry] = { (option: String) =>
        var schema: SCHEMA = sch
        // Line read past the end of the previous entry; "" if none.
        var buf: String = ""

        val headerTag = "; ##wonglimsoon@nus##"

        // Decode one "name:Type" header column. The split is at the LAST
        // colon, so a field name that itself contains ':' survives a
        // round trip through the formatter.
        def column(spec: String): (String, String => Any) =
          val at = spec.lastIndexOf(':')
          if at < 0 then
            throw new IllegalArgumentException(
              s"Malformed FASTA header column: $spec")
          spec.take(at) -> cvt(spec.drop(at + 1))

        // Decode the field values that follow the name on a title line.
        def metadata(cols: Array[String]): META =
          val fields = (1 until cols.length).map[(String, Any)] { i =>
            Option(schema).flatMap(_.lift(i - 1)) match
              case Some((label, get)) => label -> get(cols(i))
              // No schema, or more columns than the schema describes.
              case None => i.toString -> getAny(cols(i))
          }
          fields.toMap

        // Concatenate sequence lines up to the next blank, ";" or ">"
        // line. That terminating line is kept in `buf` for later.
        def sequence(it: Iterator[String]): String =
          val sq = new StringBuilder
          var more = it.hasNext
          while more do
            val tmp = it.next()
            val end = tmp.isEmpty || tmp.startsWith(";") || tmp.startsWith(">")
            if end then buf = tmp else sq ++= tmp
            more = !end && it.hasNext
          sq.toString

        def parse(it: Iterator[String], position: Int = 1): FASTAEntry =
          @scala.annotation.tailrec
          def next(): FASTAEntry =
            if buf.isEmpty && !it.hasNext then
              throw new java.io.EOFException("")
            else
              // Consume the look-ahead line first, if there is one.
              val line =
                if buf.isEmpty then it.next()
                else { val pending = buf; buf = ""; pending }

              if line.startsWith(headerTag) then
                // This line is a header. Define new schema.
                schema = line.split("\t").toVector.tail.map(column)
                next()
              else if line.startsWith(">") then
                // This line is an entry title.
                val cols = line.split("\t")
                val name = cols(0).drop(1)
                val meta = metadata(cols)
                FASTAEntry(position, name, sequence(it), meta)
              else
                // Comment, blank line, or stray text. Skip.
                next()
          next()

        parse
      }

    end FASTAFile


    /** Conversion from [[OColl]] to [[FASTA]]
     */

    extension [K](entries: OColl[FASTAEntry,K])
      // Wrap the collection's entries into a transient FASTA file, then
      // declare it (via assumeOrderedBy) ordered by the collection's key.
      def toFASTAFile: FASTA[K] =
        val transient = FASTAFile.transientFASTAFile(entries.cbi)
        transient.assumeOrderedBy(entries.key)

  end FASTAModel
 


/** Examples **********************************************
 *
{{{

   import proteomics.FASTAModel.FASTAFile
   import scala.language.implicitConversions

   val fasta = {
     val fname = "test-proinfer/human-proteins-including-decoys.fasta"
     FASTAFile(fname)
   }

   fasta(9)

   fasta(8).seq

   fasta(7).name

   fasta.length

   val wls = fasta.map(s => s.newMeta("len", s.seq.length)).serialize("wls")

   wls(8)("len")

   wls.protection(false).close()


}}}
 *
 */




