
#include <map>
#include <unordered_map>
#include <boost/algorithm/string.hpp>
#include <vector>
#include <mutex>
#include <chrono>
#include <iostream>
#include <fstream>
#include "ThreadPool.h"
#include "evidence.hpp"
#include "external.hpp"
#include "functions.hpp"

using namespace std;

// Result stores shared between check_variants() and its worker tasks.
// Each one is keyed by the variant position (the POS column parsed as an int)
// and holds the text of one VCF line; being ordered maps, iteration yields
// the lines in ascending position order.

// Annotated records destined for the main output file.
std::map<int, std::string> vcf_data;

// (disabled) store for records flagged as having too many missing alleles
// std::map<int, std::string> vcf_data_missing;

// Input lines that could not be evaluated (e.g. FORMAT without GT, AD or DP).
std::map<int, std::string> vcf_data_rejected;

// Input lines where only the reference allele was observed.
std::map<int, std::string> vcf_data_mono;

void check_variants()
{
    v_vcf = v_input;
    v_out = v_output;

    
    screen_message (screen_size, 0, "", 1, 0);
    screen_message (screen_size, 0, Program_name + "::evidence" , 1, v_quiet);
    
    map <int,string> reference_data;
    if (v_reference != "") {
        screen_message (screen_size, 2, "Loading reference ...", 2, v_quiet);
        string line;
        ifstream ref (v_reference);
        if (ref.is_open())
        {
            while ( getline (ref,line) )
            {
                if (line.substr(0,1) == "#"){continue;}
                vector <string> ref_data;
                boost::split(ref_data,line,boost::is_any_of("\t"));
                reference_data[stoi(ref_data[1])] = ref_data[0] + "\t" + ref_data[1] + "\t" + ref_data[2] + "\t" + ref_data[3] + "\t" + ref_data[4];
            }
        }
        ref.close();
        screen_message (screen_size, 2, "Loading reference ... done", 1, v_quiet);
    }
    
    
    
    
    
    
    
    string sample_line = "";
    
    ofstream out;
    out.open (v_out);
    out.close();
    out.open(v_out, std::ios_base::app);
    
    ofstream reject;
    reject.open (v_out.substr(0,v_out.size()-3) + "rejected.vcf");
    reject.close();
    reject.open(v_out.substr(0,v_out.size()-3) + "rejected.vcf", std::ios_base::app);
    
    ofstream mono;
    mono.open (v_out.substr(0,v_out.size()-3) + "monomorphic.vcf");
    mono.close();
    mono.open(v_out.substr(0,v_out.size()-3) + "monomorphic.vcf", std::ios_base::app);
    

    screen_message (screen_size, 2, "Loading and processing ...", 2, v_quiet);
    vector <string> head;
    vector <string> head_add;
    
    
    head_add.push_back("##FILTER=<ID=PASS,Description=\"Approved based on evidence\">");
    head_add.push_back("##FILTER=<ID=STR,Description=\"It looks like an STR\">");
    head_add.push_back("##FILTER=<ID=MISS,Description=\"Too many missings\">");
    head_add.push_back("##FILTER=<ID=WARN,Description=\"Approved based on evidence, high missing alleles\">");

    head_add.push_back("##INFO=<ID=SINGLETON,Number=A,Type=Integer,Description=\"Singleton status for each variant: (0)false (1)true\">");
    head_add.push_back("##INFO=<ID=EVIDENCE,Number=A,Type=Integer,Description=\"Number of samples presenting this allele in homozygous state or balanced heterozygous above threshold\">");
    head_add.push_back("##INFO=<ID=HOMOZYGOUS,Number=A,Type=Integer,Description=\"Number of homozygous for each allele\">");
    head_add.push_back("##INFO=<ID=FREQ,Number=A,Type=Float,Description=\"Frequency for each variant\">");
    head_add.push_back("##INFO=<ID=MISS,Number=1,Type=Float,Description=\"Missing frequency\">");
    head_add.push_back("##INFO=<ID=REF,Number=A,Type=Integer,Description=\"Compatibility with the reference\">");

    
    

    
    ThreadPool pool(v_threads);
    std::vector< std::future<int> > results;
    string line;
    ifstream myfile (v_vcf);
    if (myfile.is_open())
    {
        int count = 0;
        int sample_index = 0;
        int format_index = 0;
        int filter_index = 0;
        int info_index = 0;
        
        vector <string> samples;
        int buffer_count = 0;
        count++;
        
        while ( getline (myfile,line) )
        {
            if (line.substr(0,2) == "##")
            {
                if (line.substr(0,6) == "##INFO") {continue;}
                if (line.substr(0,8) == "##FILTER") {continue;}
                head.push_back(line);
                continue;
            }
            if (line.substr(0,2) == "#C")
            {
                sample_line = line;
                vector <string> data;
                boost::split(data,line,boost::is_any_of("\t"));
                int vindex = 0;
                for(auto &&item: data)
                {
                    if (item == "FORMAT") {format_index = vindex; sample_index = vindex+1;}
                    if (item == "FILTER") {filter_index = vindex;}
                    if (item == "INFO") {info_index = vindex;}
                    vindex++;
                }
                for (int a = sample_index; a < data.size(); a++) {samples.push_back(data[a]);}
                continue;
            }
            
            if (line == "") {continue;}
            
            
            vector <string> check;
            boost::split(check,line,boost::is_any_of("\t"));
            
            
            
            int str = 0;
            vector <string> altalleles;
            boost::split(altalleles,check[4],boost::is_any_of(","));
            altalleles.push_back(check[3]);
            if (altalleles.size() > 5) {
                int n = altalleles.size();
                int sizes[n];
                for (int i = 0; i < n; i++)
                {
                    sizes[i] = altalleles[i].size();
                }
                n = sizeof(sizes)/sizeof(sizes[0]);
                if (checkIsSTR(sizes, n)) {str = 1;}
            }
            
            
            
            int start_int = 0;
            if (vstart != "0")
            {
                try {
                    start_int = stoi(vstart);
                }
                catch (const std::exception& e) {
                    start_int = 0;
                }
            }
            
            int end_int = 0;
            if (vend != "0")
            {
                try {
                    end_int = stoi(vend);
                }
                catch (const std::exception& e) {
                    end_int = 0;
                }
            }
            
            
            if (v_chr != "")
            {
                if (v_chr != check[0]){continue;}
            }
            if (start_int > 0)
            {
                if (stoi(check[1]) < start_int){continue;}
            }
            if (end_int > 0)
            {
                if (stoi(check[1]) > end_int) {break;}
            }
            
            
            string comp_reference = "";
            if ( reference_data.find(stoi(check[1])) != reference_data.end() )
            {
                vector <string> ref_data;
                boost::split(ref_data,reference_data[stoi(check[1])],boost::is_any_of("\t"));

                string ref_ref_allele = ref_data[3];
                string ref_alt_alleles = ref_data[4];
                boost::split(ref_data,ref_alt_alleles,boost::is_any_of(","));
                map <string,int> refs;
                refs[ref_ref_allele] = 1;
                for (auto &item:ref_data)
                {
                    refs[item] = 1;
                }
                
                
                string current_ref_allele = check[3];
                if (refs.find(current_ref_allele) != refs.end()) {comp_reference = ",1";}
                else {comp_reference = ",0";}
                
                string current_alt_alleles = check[4];
                vector <string> current_data;
                boost::split(current_data,current_alt_alleles,boost::is_any_of(","));
                for (auto &item : current_data)
                {
                    if (refs.find(item) != refs.end()) {comp_reference = comp_reference + ",1";}
                    else {comp_reference = comp_reference + ",0";}
                }
            }
            
 
            
            check.clear();

            count_total++;

            buffer_count++;
            
            if (buffer_count == buffer)
            {
                for(auto && result: results){result.get(); } // waiting for all threads
                
                for(auto &&dado: head){out << dado << endl;}
                for(auto &&dado: head_add){out << dado << endl;}
                if (sample_line != "") {out << sample_line << endl;}
                for(auto &&dado: vcf_data){out << dado.second << endl;}
                
                for(auto &&dado: head){reject << dado << endl;}
                for(auto &&dado: head_add){reject << dado << endl;}
                if (sample_line != "") {reject << sample_line << endl;}
                for(auto &&dado: vcf_data_rejected){reject << dado.second << endl;}
                
                for(auto &&dado: head){mono << dado << endl;}head.clear();
                for(auto &&dado: head_add){mono << dado << endl;}head_add.clear();
                if (sample_line != "") {mono << sample_line << endl;} sample_line = "";
                for(auto &&dado: vcf_data_mono){mono << dado.second << endl;}
                
                
                results.clear();
                vcf_data.clear();
                vcf_data_rejected.clear();
                vcf_data_mono.clear();
                buffer_count = 0;
                screen_message (screen_size, 2, "Loading and processing ... " + to_string(count) + " variants done ...", 2, v_quiet);

            }
            
            int sample_size = samples.size();
            count = count + 1;
            results.emplace_back(
         pool.enqueue([count, line, format_index, info_index, filter_index, sample_size, str, comp_reference]
                      {
                          vector <string> data;
                          boost::split(data,line,boost::is_any_of("\t"));
                          
                          string format = data[format_index];
                          vector <string> format_data;
                          boost::split(format_data,format,boost::is_any_of(":"));
                          int vindex = 0;
                          int GT = -1;
                          int AD = -1;
                          int DP = -1;
                          
                          for(auto &&item: format_data)
                          {
                              if (item == "GT") {GT = vindex;}
                              if (item == "AD") {AD = vindex;}
                              if (item == "DP") {DP = vindex;}
                              vindex++;
                          }
                          
                          if (((GT == -1) || (AD == -1)) || (DP == -1))
                          {
                              mtx.lock();
                              vcf_data_rejected[stoi(data[1])] = line;
                              count_rejected++;
                              mtx.unlock();
                              return 1;
                          }
                          
                          
                          int missing = 0;
                          map <string,int> homozygous;
                          map <string,int> evidence;
                          map <string,int> allele_frequency;
                          string info = "";
                          string pass = "";
                          
                          
                          
                          for (int a = format_index+1; a < data.size(); a++)
                          {
                              vector <string> fields;
                              boost::split(fields,data[a],boost::is_any_of(":"));
                              string gt_value = fields[GT];
                              string ad_value = fields[AD];
                              string dp_value = fields[DP];
                              
                              
                              if ((gt_value == "") || (gt_value == "./.")) {missing++;missing++;continue;}
                              if (((ad_value == "") || (ad_value == ".")) || (ad_value == "0")) {missing++;missing++;continue;}
                              if (((dp_value == "") || (dp_value == ".")) || (dp_value == "0")) {missing++;missing++;continue;}
                              
                              
                              vector <string> gt_decom;
                              boost::split(gt_decom,gt_value,boost::is_any_of("/"));
                              if (gt_decom.size() == 1){boost::split(gt_decom,gt_value,boost::is_any_of("|"));}
                              if (gt_decom.size() != 2) {continue;}
                              string alleleA = gt_decom[0];
                              string alleleB = gt_decom[1];
                              

                              
                              vector <string> ad_decom;
                              boost::split(ad_decom,ad_value,boost::is_any_of(","));
                              
                              if ((alleleA == ".") || (alleleA == "")) {missing++;};
                              if ((alleleB == ".") || (alleleB == "")) {missing++;};
                              if (alleleA != ".") {allele_frequency[alleleA]++;}
                              if (alleleB != ".") {allele_frequency[alleleB]++;}
                              if ((alleleA == ".") || (alleleB == ".")) {continue;}
                              
                              
                              //homozygous
                              if (alleleA == alleleB)
                              {
                                  if ((dp_value == ".") || (dp_value == "")){continue;}
                                  if ((ad_decom[stoi(alleleA)] == ".") || (ad_decom[stoi(alleleA)] == "")) {continue;}
                                  homozygous[alleleA]++;
                                  if (( stof( ad_decom[stoi(alleleA)] ) / stof(dp_value)) == 1)
                                  {
                                      if (stoi(dp_value) >= min_cov) {
                                          evidence[alleleA]++;
                                      }
                                      continue;
                                  }
                              }
                              
                              
                              
                              
                              // heterozygous
                              if (alleleA != alleleB)
                              {
                                  
                                  
                                  if ((( stof(ad_decom[stoi(alleleA)] ) / stof(dp_value)) >= min_proportion) )
                                  {
                                      if ((stoi(dp_value) >= min_cov) && (( stoi(ad_decom[stoi(alleleA)]) + stoi(ad_decom[stoi(alleleB)])) == stoi(dp_value)))
                                      {
                                          evidence[alleleA]++;
                                      }
                                  }

                                  if ((( stof(ad_decom[stoi(alleleB)] ) / stof(dp_value)) >= min_proportion) )
                                  {
                                      if ((stoi(dp_value) >= min_cov) && (( stoi(ad_decom[stoi(alleleA)]) + stoi(ad_decom[stoi(alleleB)])) == stoi(dp_value)))
                                      {
                                          evidence[alleleB]++;
                                      }
                                      continue;
                                  }
                                  
                                  /*
                                  if ((( stof(ad_decom[stoi(alleleA)] ) / stof(dp_value)) >= min_proportion) &&  (( stof(ad_decom[stoi(alleleA)] ) / stof(dp_value)) <= max_proportion))
                                  {
                                      if ((stoi(dp_value) >= min_cov) && (( stoi(ad_decom[stoi(alleleA)]) + stoi(ad_decom[stoi(alleleB)])) == stoi(dp_value)))
                                      {
                                          evidence[alleleA]++;
                                      }
                                  }
                                  
                                  if ((( stof(ad_decom[stoi(alleleB)] ) / stof(dp_value)) >= min_proportion) &&  (( stof(ad_decom[stoi(alleleB)] ) / stof(dp_value)) <= max_proportion))
                                  {
                                      if ((stoi(dp_value) >= min_cov) && (( stoi(ad_decom[stoi(alleleA)]) + stoi(ad_decom[stoi(alleleB)])) == stoi(dp_value)))
                                      {
                                          evidence[alleleB]++;
                                      }
                                      continue;
                                  }
                                   */
                                  }
                              
                          }
                          
                          
                          // to many missings
                          int reject_missing = 0;
                          if ((float(missing) / (float(sample_size)*2)) > max_missing_per_variant)
                          {reject_missing=1;count_missing++;}
                          
                          
                          string alternatives = data[4];
                          vector <string> alts;
                          boost::split(alts,alternatives,boost::is_any_of(","));
                          if (alts.size() == 0) {alts[0] = data[4];}

                          
                          
                          // Frequency info
                          int allele_count = 0;
                          int reject_monomorfic = 0;
                          string freqs;
                          for (int a = 0; a <= alts.size(); a++)
                          {
                              float freq_value = float(allele_frequency[to_string(a)]) / (float(sample_size)*2);
                              freqs = freqs + "," + to_string(freq_value);
                              if (float(allele_frequency[to_string(a)]) > 0) {allele_count++;}
                          }
                          string frequency_line = "FREQ=" + freqs.substr(1);
                          
                          
                          // missing info
                          float freq_value = float(missing) / (float(sample_size)*2);
                          string missing_line = "MISS=" + to_string(freq_value);
                          
                          
                          
                          if (allele_count == 1)
                          {
                              if (allele_frequency["0"] == 0) {reject_monomorfic=2;}
                              
                              if (allele_frequency["0"] > 0)
                              {
                                  reject_monomorfic=1;
                                  mtx.lock();
                                  vcf_data_mono[stoi(data[1])] = line;
                                  mtx.unlock();
                              }
                              count_monomorfic++;
                          }
  
                          
                          
                          
                          
                          // singleton
                          string sing_found = "";
                          for (int a = 0; a <= alts.size(); a++)
                          {
                              if (float(allele_frequency[to_string(a)]) == 1)
                              {
                                  sing_found = sing_found + ",1";
                                  count_singleton++;
                              }
                              else {
                                  sing_found = sing_found + ",0";
                              }
                          }
                          string singleton_line = "SINGLETON=" + sing_found.substr(1);

                          
  
                          
                          //evidence
                          int evidence_count = 0;
                          string evid_found = "";
                          for (int a = 0; a <= alts.size(); a++)
                          {
                              evid_found = evid_found + "," + to_string(evidence[to_string(a)]);
                              if (a != 0) {
                                  if (float(evidence[to_string(a)]) >= 1)
                                  {
                                      evidence_count++;
                                  }
                              }
                          }
                          string evidence_line;
                          evidence_line = "EVIDENCE=" + evid_found.substr(1);
                          
 
                          
                          //homozygous
                          string homo_found = "";
 //                         int homo_count = 0;
                          for (int a = 0; a <= alts.size(); a++)
                          {
                              homo_found = homo_found + "," + to_string(homozygous[to_string(a)]);
   //                           if (a != 0) {
     //                             if (float(homozygous[to_string(a)]) >= 1) {homo_count++;}
       //                       }
                          }
                          string homozygous_line;
                          homozygous_line = "HOMOZYGOUS=" + homo_found.substr(1);
                          
                          
   
                          
                          string infoline = singleton_line + ";" + evidence_line + ";" + homozygous_line + ";" + frequency_line + ";" + missing_line;
                          if (comp_reference != "") {infoline = infoline + ";REF=" + comp_reference.substr(1);}

                          
                          
                          string filter = ".";
                          string old_filter = data[filter_index];

                          
                          int pass_help = 0;
                          int warn_help = 0;

 //                       if ((homo_count >= 1) && (reject_missing == 1)) {filter="WARN";warn_help = 1;}
                        if ((evidence_count >= 1) && (reject_missing == 1)) {filter="WARN";warn_help = 1;}
                        if ((evidence_count == 0) && (reject_missing == 1)) {filter="MISS";pass_help = 0;}
                        if ((evidence_count >= 1) && (reject_missing == 0)) {filter="PASS";pass_help = 1;}
 //                       if ((homo_count >= 1) && (reject_missing == 0)) {filter="PASS";pass_help = 1;}
//                          if ((evidence_count != (alts.size()+1)) && (str == 1)){filter="STR";}
                        if (reject_monomorfic == 2) {filter="PASS";count_mono_alt++;}
                        if (pass_help == 1) {count_pass++;}
                        if (warn_help == 1) {count_warn++;}

                          
                          if (keep_info == 1)
                          {
                              data[info_index] = infoline + ";" + data[info_index];
                          }
                          if (keep_info == 0)
                          {
                              data[info_index] = infoline;
                          }

                          data[filter_index] = filter;
                          
                          mtx.lock();
                          string newline = data[0];
                          for (int b = 1; b < data.size(); b++)
                          {
                              newline = newline + "\t" + data[b];
                          }
 //                         if (reject_missing == 1) {vcf_data_missing[stoi(data[1])] = newline;}
                          if (reject_monomorfic != 1) {vcf_data[stoi(data[1])] = newline;}
                          
                          mtx.unlock();
                          
                          newline = "";
                          data.clear();
                          homozygous.clear();
                          evidence.clear();
                          allele_frequency.clear();
                          info = "";
                          pass = "";
                          return stoi(data[1]);
                      })
         );
            
        }
        myfile.close();
    }
    else {cout << "error";return;}
    
    for(auto && result: results){result.get();} // waiting for all threads
    
    
    for(auto &&dado: head){out << dado << endl;}
    for(auto &&dado: head_add){out << dado << endl;}
    if (sample_line != "") {out << sample_line << endl;}
    for(auto &&dado: vcf_data){out << dado.second << endl;}
    out.close();
    
//    for(auto &&dado: head){miss << dado << endl;}
//    for(auto &&dado: head_add){miss << dado << endl;}
//    if (sample_line != "") {miss << sample_line << endl;}
//    for(auto &&dado: vcf_data_missing){miss << dado.second << endl;}
//    miss.close();
    
    for(auto &&dado: head){reject << dado << endl;}
    for(auto &&dado: head_add){reject << dado << endl;}
    if (sample_line != "") {reject << sample_line << endl;}
    for(auto &&dado: vcf_data_rejected){reject << dado.second << endl;}
    reject.close();
    
    for(auto &&dado: head){mono << dado << endl;}
    for(auto &&dado: head_add){mono << dado << endl;}
    if (sample_line != "") {mono << sample_line << endl;}
    for(auto &&dado: vcf_data_mono){mono << dado.second << endl;}
    mono.close();
    screen_message (screen_size, 2, "Loading and processing ... done", 1, v_quiet);
    screen_message (screen_size, 2, "Number of variants                              : " + to_string(count_total), 1, v_quiet);
    screen_message (screen_size, 2, "Variants annotated with PASS                    : " + to_string(count_pass), 1, v_quiet);
    screen_message (screen_size, 2, "Variants annotated with WARN                    : " + to_string(count_warn), 1, v_quiet);
    screen_message (screen_size, 2, "Variants annotated with MISS                    : " + to_string(count_missing), 1, v_quiet);

    screen_message (screen_size, 2, "Monomorphic for reference (removed)             : " + to_string(count_monomorfic), 1, v_quiet);


    screen_message (screen_size, 2, "Number of Singletons                            : " + to_string(count_singleton), 1, v_quiet);
    screen_message (screen_size, 2, "Rejected variants (removed)                     : " + to_string(count_rejected), 1, v_quiet);

}


void help_evidence ()
{
    screen_message (screen_size, 0, "", 1, 0);
    screen_message (screen_size, 0, Program_name + "::evidence" , 1, 0);
    screen_message (screen_size, 0, "", 1, 0);
    screen_message (screen_size, 2, "* Author  : " + Program_author, 1, 0);
    screen_message (screen_size, 2, "* Contact : " + Program_contact, 1, 0);
    screen_message (screen_size, 2, "* Version : " + Program_version, 1, 0);
    screen_message (screen_size, 0, "", 1, 0);
    screen_message (screen_size, 2, "Options", 1, 0);
    screen_message (screen_size, 5, "input      the input VCF file", 1, 0);
    screen_message (screen_size, 5, "output     the VCF file to be created", 1, 0);
    screen_message (screen_size, 5, "chr        the chromosome to be considered", 1, 0);
    screen_message (screen_size, 5, "start      start processing from this position", 1, 0);
    screen_message (screen_size, 5, "end        process variants to this position", 1, 0);
    screen_message (screen_size, 5, "reference  VCF with known variants" , 1, 0);
    screen_message (screen_size, 5, "mincov     minimal depth of coverage (default: " + to_string(min_cov) + ")" , 1, 0);
    screen_message (screen_size, 5, "minprop    min depth proportion to consider (default: " + to_string(min_proportion) + ")", 1, 0);
 //   screen_message (screen_size, 5, "maxprop    max depth proportion to consider (default: " + to_string(max_proportion) + ")", 1, 0);
    screen_message (screen_size, 5, "maxmiss    max missing allele proportion per variant (default: " + to_string(max_missing_per_variant) + ")", 1, 0);
    screen_message (screen_size, 5, "buffer     buffer size (default: " + to_string(buffer) + " variants)", 1, 0);
    screen_message (screen_size, 5, "threads    number of additional threads (default: " + to_string(v_threads) + ")", 1, 0);
    screen_message (screen_size, 5, "--quiet    quiet mode", 1, 0);
    screen_message (screen_size, 5, "--info     keep original info data and the new one", 1, 0);
    screen_message (screen_size, 0, "", 1, 0);
    PrintWarnings();

    return;
    
}

// Entry point of the "evidence" sub-command: checks that the input file exists,
// fills in a default output name when none was given, then runs check_variants().
void main_evidence ()
{
    // Without an input file there is nothing to do: record the warning and
    // show the help text (which also prints the warnings).
    if (! fileExists(v_input))
    {
        warnings.push_back("The input file could not be found.");
        help_evidence();
        return;
    }

    // Default output name: the input name without its extension plus ".evid.vcf".
    if (v_output == "")
    {
        v_output = GetFileNameWithoutExtension (v_input) + ".evid.vcf";
    }

    check_variants();
}
