//-----------------------------------------------
// Copyright 2016 Guangxi University
// Written by Liang Zhao(S080011@e.ntu.edu.sg)
// Released under the GPL
//-----------------------------------------------
//
// preprocess - prepare data files for error correction
// It was developed from SGA, originally written by Jared Simpson (js18@sanger.ac.uk)
//

#include <iostream>
#include <fstream>
#include <cstddef>
#include "Util.h"
#include "preprocess.h"
#include "Timer.h"
#include "SeqReader.h"
//#include "PrimerScreen.h"
#include "Alphabet.h"
#include "Quality.h"

//static unsigned int DEFAULT_MIN_LENGTH = 75;
// Bases whose phred score is <= this value are counted as low quality by
// countLowQuality(). The usage text quotes the same number literally, so keep
// the two in sync. Declared const: nothing in this file ever modifies it.
static const int LOW_QUALITY_PHRED_SCORE = 3;

//
// Getopt
//
// Name of this subcommand. It is spliced into the version and usage strings
// below, and into the error messages of parsePreprocessOptions(), by string
// literal concatenation.
#define SUBPROGRAM "preprocess"
// Text printed to stdout for --version.
// PACKAGE_VERSION is not defined in this file; it has to come from one of the
// included headers or from the build configuration.
static const char *PREPROCESS_VERSION_MESSAGE =
SUBPROGRAM " Version " PACKAGE_VERSION "\n";

// Text printed to stdout for --help and whenever option parsing fails.
// PACKAGE_NAME and PACKAGE_BUGREPORT are, like PACKAGE_VERSION, defined
// outside this file.
// The low-quality threshold quoted in the -f description ("<= 3") is literal
// text; the value actually applied is LOW_QUALITY_PHRED_SCORE.
// NOTE(review): --version is accepted by the option table but is not listed here.
static const char *PREPROCESS_USAGE_MESSAGE =
"Usage: " PACKAGE_NAME " " SUBPROGRAM " [OPTION] READS1 READS2 ...\n"
"\n"
"    --help                      display this help and exit\n"
"    -o, --out=FILE              write the reads to FILE (default: stdout)\n"
"    -p, --pe-mode=INT           0 - do not treat reads as paired (default)\n"
"                                1 - reads are paired with the first read in READS1 and the second\n"
"                                read in READS2. The paired reads will be interleaved in the output file\n"
"                                2 - reads are paired and the records are interleaved within a single file.\n"
"    -f, --quality-filter=INT    discard the read if it contains more than INT low-quality bases.\n"
"                                Bases with phred score <= 3 are considered low quality. Default: no filtering.\n"
"                                The filtering is applied after trimming so bases removed are not counted.\n"
"                                Do not use this option if you are planning to use the BCR algorithm for indexing.\n"

"    --discard-quality           do not output quality scores\n"
"    --permute-ambiguous         Randomly change ambiguous base calls to one of possible bases.\n"
"                                If this option is not specified, the entire read will be discarded.\n"
"    --dust                      Perform dust-style filtering of low complexity reads.\n"
"    --dust-threshold=FLOAT      filter out reads that have a dust score higher than FLOAT (default: 4.0).\n"
"    --suffix=SUFFIX             append SUFFIX to each read ID\n"
"\nReport bugs to " PACKAGE_BUGREPORT "\n\n";

// Quality-score scaling modes.
// NOTE(review): no code in this file refers to this enum or its enumerators;
// presumably a leftover from the SGA code this file was derived from - confirm
// before removing.
enum QualityScaling
{
    QS_UNDEFINED = 0,
    QS_NONE      = 1,
    QS_SANGER    = 2,
    QS_PHRED64   = 3
};

// Run-time settings of the preprocess subprogram. The defaults below are
// overridden by parsePreprocessOptions().
namespace opt
{
    // --- output ---
    static std::string outFile;             // -o/--out; empty means write to stdout
    static std::string outDim;              // outFile + ".dim"; set in preprocessMain() (@lzhao)
    static std::string suffix;              // --suffix; appended to read IDs when non-empty
    static bool bDiscardQuality = false;    // --discard-quality; drop quality strings on output

    // --- input layout ---
    static unsigned int peMode = 0;         // -p/--pe-mode; 0 = unpaired, 1 = two files, 2 = interleaved

    // --- filtering ---
    static int qualityFilter = -1;          // -f/--quality-filter; negative means no filtering
    static bool bDiscardAmbiguous = true;   // cleared by --permute-ambiguous
    static bool bDustFilter = false;        // --dust, also enabled by --dust-threshold
    static double dustThreshold = 4.0;      // --dust-threshold

    // --- randomness ---
    static unsigned int seed = 0;           // 0 makes preprocessMain() seed from time(NULL)
}

// Short options understood by getopt_long(): -o FILE, -p INT, -f INT.
// Every one of them takes an argument, so each letter is followed by ':'.
// (Without the ':' after 'f', "-f 5" left optarg NULL and treated "5" as an
// input file, contradicting the usage text and the long-option table.)
static const char* shortopts = "o:p:f:";

// Codes returned by getopt_long() for the options that have no short letter.
// They start at 1 and stay far below the printable characters used for the
// short options ('o', 'p', 'f') and below '?'.
enum
{
    OPT_HELP            = 1,
    OPT_VERSION         = 2,
    OPT_PERMUTE         = 3,
    OPT_DUST            = 4,
    OPT_DUST_THRESHOLD  = 5,
    OPT_SUFFIX          = 6,
    OPT_DISCARD_QUALITY = 7
};

// Long-option table handed to getopt_long() in parsePreprocessOptions().
// The flag field is NULL in every entry, so getopt_long() returns the entry's
// last field: the short-option letter for out / pe-mode / quality-filter, and
// an OPT_* code for the options that have no short form.
// The all-zero entry terminates the table.
static const struct option longopts[] = {
    { "out",                    required_argument, NULL, 'o' },
    { "pe-mode",                required_argument, NULL, 'p' },
    { "quality-filter",         required_argument, NULL, 'f' },
    { "permute-ambiguous",      no_argument,       NULL, OPT_PERMUTE },
    { "discard-quality",        no_argument,       NULL, OPT_DISCARD_QUALITY },
    { "dust",                   no_argument,       NULL, OPT_DUST},
    { "dust-threshold",         required_argument, NULL, OPT_DUST_THRESHOLD },
    { "suffix",                 required_argument, NULL, OPT_SUFFIX },
    { "help",                   no_argument,       NULL, OPT_HELP },
    { "version",                no_argument,       NULL, OPT_VERSION },
    { NULL, 0, NULL, 0 }
};

// Run statistics, reported on stderr at the end of preprocessMain().

// Reads / bases handed to processRead().
static int64_t s_numReadsRead = 0;
static int64_t s_numBasesRead = 0;

// Reads / bases written to the output.
static int64_t s_numReadsKept = 0;
static int64_t s_numBasesKept = 0;

// Reads rejected by the dust filter in processRead().
static int64_t s_numFailedDust = 0;

// Grows by 2 for every pair whose IDs do not name each other as mates.
static int64_t s_numInvalidPE = 0;

//
// Main
//
// Entry point of the "preprocess" subprogram.
//
// Parses the command line, reports the effective parameters on stderr, then
// reads every input file named after the options, filters each read (or read
// pair) through processRead() and writes the survivors to the output stream.
// Only reads whose length equals that of the first read seen are considered;
// the others are skipped before they reach processRead(), so they do not show
// up in the "parsed" statistics.
//
// After the reads, a single line "<reads kept> <read length>" is written to
// "<outFile>.dim", or to stdout when no output file was given.
//
// argc/argv: the command line as received by the subprogram.
// Returns 0 on completion. Calls exit(EXIT_FAILURE) for unusable pe-mode 1
// input (an odd number of files, or "-" given as a file name).
int preprocessMain(int argc, char** argv)
{
    // NOTE(review): presumably the Timer reports the elapsed time when it is
    // deleted at the end of this function - confirm in Timer.h.
    Timer* pTimer = new Timer("mec preprocess");
    parsePreprocessOptions(argc, argv);

    // set random seed
    if (opt::seed == 0)
    {
        opt::seed = time(NULL);
    }

    std::cerr << "Parameters:\n";

    if(opt::qualityFilter >= 0)
        std::cerr << "QualFilter: at most " << opt::qualityFilter << " low quality bases\n";
    else
        std::cerr << "QualFilter: no filtering\n";

    std::cerr << "PE Mode: " << opt::peMode << "\n";
    std::cerr << "Outfile: " << (opt::outFile.empty() ? "stdout" : opt::outFile) << "\n";
    if(opt::bDiscardAmbiguous)
        std::cerr << "Discarding sequences with ambiguous bases\n";
    if(opt::bDiscardQuality)
        std::cerr << "Discarding quality scores\n";
    if(opt::bDustFilter)
        std::cerr << "Dust threshold: " << opt::dustThreshold << "\n";
    if(!opt::suffix.empty())
        std::cerr << "Suffix: " << opt::suffix << "\n";
    std::cerr << "Seed: " << opt::seed << "\n";

    // Seed the RNG used by processRead() to permute ambiguous bases
    srand(opt::seed);

    // Select the destinations of the reads and of the dimension line.
    // Writers obtained from createWriter() are owned here and deleted below;
    // std::cout is never deleted.
    std::ostream* pWriter;
    //@--START lzhao--
    std::ostream* pDimWriter;
    //@--END lzhao--
    if(opt::outFile.empty())
    {
        pWriter = &std::cout;
        //@--START lzhao--
        pDimWriter = &std::cout;
        //@--END lzhao--
    }
    else
    {
        pWriter = createWriter(opt::outFile);

        //@--START lzhao--
        opt::outDim = opt::outFile + ".dim";
        pDimWriter = createWriter(opt::outDim);
        //@--END lzhao--
    }

    //@--START lzhao--
    // Length of the first read seen; every later read must match it.
    size_t g_readLength = 0;
    size_t g_isChecked = 0;
    //@--END lzhao--

    if(opt::peMode == 0)
    {
        // Treat files as SE data
        while(optind < argc)
        {
            std::string filename = argv[optind++];
            std::cerr << "Processing " << filename << "\n\n";
            SeqReader reader(filename, SRF_NO_VALIDATION);
            SeqRecord record;

            while(reader.get(record))
            {
                //@--START lzhao--
                if (!g_isChecked) {
                    g_readLength = record.seq.length();
                    g_isChecked = 1;
                } else {
                    if (record.seq.length() != g_readLength) continue;
                }
                //@--END lzhao--

                bool passed = processRead(record);
                if(passed) // && samplePass())
                {
                    if(!opt::suffix.empty())
                        record.id.append(opt::suffix);

                    record.write(*pWriter);
                    ++s_numReadsKept;
                    s_numBasesKept += record.seq.length();
                }
            }
        }

    }
    else
    {
        assert(opt::peMode == 1 || opt::peMode == 2);
        int numFiles = argc - optind;
        if(opt::peMode == 1 && numFiles % 2 == 1)
        {
            std::cerr << "Error: An even number of files must be given for pe-mode 1\n";
            exit(EXIT_FAILURE);
        }

        while(optind < argc)
        {
            SeqReader* pReader1;
            SeqReader* pReader2;

            if(opt::peMode == 1)
            {
                // Read from separate files
                std::string filename1 = argv[optind++];
                std::string filename2 = argv[optind++];

                if(filename1 == "-" || filename2 == "-")
                {
                    std::cerr << "Reading from stdin is not supported in --pe-mode 1\n";
                    std::cerr << "Maybe you meant --pe-mode 2 (interleaved pairs?)\n";
                    exit(EXIT_FAILURE);
                }

                pReader1 = new SeqReader(filename1, SRF_NO_VALIDATION);
                pReader2 = new SeqReader(filename2, SRF_NO_VALIDATION);

                std::cerr << "Processing pe files " << filename1 << ", " << filename2 << "\n";

            }
            else
            {
                // Read from a single file; both "readers" are the same object,
                // so consecutive records form a pair
                std::string filename = argv[optind++];
                pReader1 = new SeqReader(filename, SRF_NO_VALIDATION);
                pReader2 = pReader1;
                std::cerr << "Processing interleaved pe file " << filename << "\n";
            }

            SeqRecord record1;
            SeqRecord record2;
            while(pReader1->get(record1) && pReader2->get(record2))
            {
                //@--START lzhao--
                if (!g_isChecked) {
                    g_readLength = record1.seq.length();
                    g_isChecked = 1;
                }
                // Both mates must have the reference length. This also covers
                // the second read of the very first pair, which would otherwise
                // slip through unchecked and contradict the single read length
                // reported in the dimension line.
                if ((record1.seq.length() != g_readLength) || (record2.seq.length() != g_readLength)) continue;
                //@--END lzhao--

                // If the names of the records are the same, append a /1 and /2 to them
                if(record1.id == record2.id)
                {
                    if(!opt::suffix.empty())
                    {
                        record1.id.append(opt::suffix);
                        record2.id.append(opt::suffix);
                    }

                    record1.id.append("/1");
                    record2.id.append("/2");
                }

                // Ensure the read names are sensible
                std::string expectedID2 = getPairID(record1.id);
                std::string expectedID1 = getPairID(record2.id);

                if(expectedID1 != record1.id || expectedID2 != record2.id)
                {
                    // NOTE(review): the pair is only counted and warned about
                    // here; it is still processed and may be written below.
                    std::cerr << "Warning: Pair IDs do not match (expected format /1,/2 or /A,/B)\n";
                    std::cerr << "Read1 ID: " << record1.id << "\n";
                    std::cerr << "Read2 ID: " << record2.id << "\n";
                    s_numInvalidPE += 2;
                }

                bool passed1 = processRead(record1);
                bool passed2 = processRead(record2);

                //if(!samplePass())
                //    continue;

                // A pair is kept only if both mates pass
                if(passed1 && passed2)
                {
                    record1.write(*pWriter);
                    record2.write(*pWriter);
                    s_numReadsKept += 2;
                    s_numBasesKept += record1.seq.length();
                    s_numBasesKept += record2.seq.length();
                }
            }

            if(pReader2 != pReader1)
            {
                // only delete reader2 if it is a distinct pointer
                delete pReader2;
                pReader2 = NULL;
            }
            delete pReader1;
            pReader1 = NULL;
        }
    }

    if(pWriter != &std::cout)
        delete pWriter;

    //@--START lzhao--
    // Dimension line: number of reads kept and their common length
    (*pDimWriter) << s_numReadsKept << " " << g_readLength << "\n";
    if (pDimWriter != &std::cout)
        delete pDimWriter;
    //@--END lzhao--

    // With no reads parsed the ratios would be 0/0 and print as NaN; report 0 instead.
    const double readsKeptRatio = (s_numReadsRead > 0)
        ? static_cast<double>(s_numReadsKept) / static_cast<double>(s_numReadsRead)
        : 0.0;
    const double basesKeptRatio = (s_numBasesRead > 0)
        ? static_cast<double>(s_numBasesKept) / static_cast<double>(s_numBasesRead)
        : 0.0;

    std::cerr << "\nPreprocess stats:\n";
    std::cerr << "Reads parsed:\t" << s_numReadsRead << "\n";
    std::cerr << "Reads kept:\t" << s_numReadsKept << " (" << readsKeptRatio << ")\n";
    std::cerr << "Bases parsed:\t" << s_numBasesRead << "\n";
    std::cerr << "Bases kept:\t" << s_numBasesKept << " (" << basesKeptRatio << ")\n";
    std::cerr << "Number of incorrectly paired reads that were discarded: " << s_numInvalidPE << "\n";
    if(opt::bDustFilter)
        std::cerr << "Number of reads failed dust filter: " << s_numFailedDust << "\n";
    delete pTimer;
    return 0;
}

// Process a single read by quality trimming, filtering
// returns true if the read should be kept
// Apply the per-read filters to a single record.
//
// Updates the parsed-read and parsed-base counters, optionally replaces each
// ambiguity code by a randomly chosen base from its set of possible symbols,
// and then rejects the read if it still contains anything other than A, C, G
// or T, if it has more low-quality bases than allowed, or if its dust score is
// not below the threshold.
// A read that survives the filters has its sequence replaced by the (possibly
// permuted) sequence, and its quality string cleared when quality output is
// disabled.
// Returns true if the read should be kept.
bool processRead(SeqRecord& record)
{
    std::string bases = record.seq.toString();
    std::string quals = record.qual;

    s_numReadsRead += 1;
    s_numBasesRead += bases.size();

    // The user asked to keep reads with ambiguity codes: resolve every code to
    // one of the bases it stands for, chosen at random.
    if(!opt::bDiscardAmbiguous)
    {
        for(std::string::iterator it = bases.begin(); it != bases.end(); ++it)
        {
            // '.' is treated as 'N'
            if(*it == '.')
                *it = 'N';

            if(IUPAC::isAmbiguous(*it))
            {
                std::string candidates = IUPAC::getPossibleSymbols(*it);
                int pick = rand() % candidates.size();
                *it = candidates[pick];
            }
        }
    }

    // Anything that is not plain ACGT at this point disqualifies the read
    if(bases.find_first_not_of("ACGT") != std::string::npos)
        return false;

    // Quality filter (only applicable when the read carries quality values)
    if(opt::qualityFilter >= 0 && !quals.empty()
       && countLowQuality(bases, quals) > opt::qualityFilter)
    {
        return false;
    }

    // Dust filter: the score must be strictly below the threshold
    if(opt::bDustFilter && !(calculateDustScore(bases) < opt::dustThreshold))
    {
        s_numFailedDust += 1;
        return false;
    }

    // Commit the result to the record
    record.seq = bases;
    if(opt::bDiscardQuality)
        record.qual.clear();
    else
        record.qual = quals;

    // Empty reads are never kept
    return record.seq.length() != 0;
}

// Count the number of low quality bases in the read
// Count the bases of a read whose phred score, as decoded by
// Quality::char2phred(), is at or below LOW_QUALITY_PHRED_SCORE.
// seq and qual must have the same length (asserted); one quality character is
// examined per base of seq.
int countLowQuality(const std::string& seq, const std::string& qual)
{
    assert(seq.size() == qual.size());

    const size_t numBases = seq.length();
    int lowCount = 0;
    for(size_t idx = 0; idx != numBases; ++idx)
    {
        const int phred = Quality::char2phred(qual[idx]);
        lowCount += (phred <= LOW_QUALITY_PHRED_SCORE) ? 1 : 0;
    }
    return lowCount;
}

//
// Handle command line arguments
//
// Parse the command line of the preprocess subprogram into the opt:: settings.
//
// Options are consumed with getopt_long() using the shortopts/longopts tables;
// afterwards optind indexes the first input file in argv.
// --help and --version print their message to stdout and exit successfully.
// An unrecognised option, or a command line without any input file, prints the
// usage message and exits with EXIT_FAILURE; so does a pe-mode greater than 2.
void parsePreprocessOptions(int argc, char** argv)
{
    bool die = false;

    // getopt_long() returns an int. It must not be stored in a char: where
    // plain char is unsigned, the -1 end marker would never compare equal and
    // this loop would not terminate.
    int c;
    while ((c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1)
    {
        const std::string value(optarg != NULL ? optarg : "");
        std::istringstream arg(value);
        switch (c)
        {
            // String options take their argument verbatim; extracting them
            // with operator>> would cut the value at the first whitespace
            // (e.g. an output path containing a space).
            case 'o': opt::outFile = value; break;
            case OPT_SUFFIX: opt::suffix = value; break;

            // Numeric options
            case 'p': arg >> opt::peMode; break;
            case 'f': arg >> opt::qualityFilter; break;
            // Giving a threshold also switches the dust filter on
            case OPT_DUST_THRESHOLD: arg >> opt::dustThreshold; opt::bDustFilter = true; break;

            // Flags
            case OPT_PERMUTE: opt::bDiscardAmbiguous = false; break;
            case OPT_DUST: opt::bDustFilter = true; break;
            case OPT_DISCARD_QUALITY: opt::bDiscardQuality = true; break;

            // getopt_long() reports unknown options / missing arguments as '?'
            case '?': die = true; break;

            case OPT_HELP:
                std::cout << PREPROCESS_USAGE_MESSAGE;
                exit(EXIT_SUCCESS);
            case OPT_VERSION:
                std::cout << PREPROCESS_VERSION_MESSAGE;
                exit(EXIT_SUCCESS);
        }
    }

    // At least one input file is required
    if (argc - optind < 1)
    {
        std::cerr << SUBPROGRAM ": missing arguments\n";
        die = true;
    }

    if (die)
    {
        std::cout << "\n" << PREPROCESS_USAGE_MESSAGE;
        exit(EXIT_FAILURE);
    }

    if(opt::peMode > 2)
    {
        std::cerr << SUBPROGRAM ": error pe-mode must be 0,1 or 2 (found: " << opt::peMode << ")\n";
        exit(EXIT_FAILURE);
    }
}
