
#include <map>
#include <unordered_map>
#include <boost/algorithm/string.hpp>
#include <vector>
#include <mutex>
#include <chrono>
#include <iostream>
#include <fstream>
#include "ThreadPool.h"
#include "evidence.hpp"
#include "external.hpp"
#include "functions.hpp"
#include "statistics.hpp"

using namespace std;

// ---------------------------------------------------------------------------
// Accumulators filled by check_statistics().
// Maps named variants_* are keyed by the integer parsed from the record's
// second tab-separated column; maps named samples_* are keyed by the column
// name taken from the "#C..." header line.
// ---------------------------------------------------------------------------

// Records whose FORMAT column contains no "GT" entry (value is set to 1).
map <int,int> variants_rejected;
// Homozygous genotype counts, per variant and per sample.
map <int,int> variants_homozygous;
map <string,int> samples_homozygous;
// Heterozygous genotype counts, per variant and per sample.
map <int,int> variants_heterozygous;
map <string,int> samples_heterozygous;
// Missing ('.') allele counts, per variant and per sample.
map <int,int> variants_missing;
map <string,int> samples_missing;
// Genotypes whose first two alleles are both missing, per variant and per sample.
map <int,int> variants_double_missing;
map <string,int> samples_double_missing;
// Header column names found after the FORMAT column (value is always 1).
map <string,int> sample_list;
// Number of header columns after FORMAT.
int total_samples = 0;
// Number of records that passed the chromosome/start/end filters.
int total_variants = 0;
// Number of missing ('.') alleles counted over all processed genotypes.
int total_missing = 0;
// Positions of the accepted records, in the order they were read.
vector <int> variants;
// Every tab-separated column of the "#C..." header line (not only sample names).
vector <string> samples;

void check_statistics()
{
    v_vcf = v_input;
    v_out = v_output;
    
    screen_message (screen_size, 0, "", 1, 0);
    screen_message (screen_size, 0, Program_name + "::statistics" , 1, v_quiet);
    screen_message (screen_size, 2, "Loading and processing ...", 2, v_quiet);
    vector <string> head;
    
    ThreadPool pool(v_threads);
    std::vector< std::future<int> > results;
    string line;
    ifstream myfile (v_vcf);
    if (myfile.is_open())
    {
        int count = 0;
        int sample_index = 0;
        int format_index = 0;
        int filter_index = 0;
        int info_index = 0;
        
        
        count++;
        
        while ( getline (myfile,line) )
        {
            if (line.substr(0,2) == "##") {head.push_back(line);continue;}
            if (line.substr(0,2) == "#C")
            {
                vector <string> data;
                boost::split(data,line,boost::is_any_of("\t"));
                int vindex = 0;
                for(auto &&item: data)
                {
                    if (item == "FORMAT") {format_index = vindex; sample_index = vindex+1;}
                    if (item == "FILTER") {filter_index = vindex;}
                    if (item == "INFO") {info_index = vindex;}
                    vindex++;
                }
                boost::split(samples,line,boost::is_any_of("\t"));
                head.push_back(line);continue;
            }
            
            total_samples = samples.size() - format_index - 1;
            
            for (int a = format_index+1; a < samples.size(); a++)
            {
                sample_list[samples[a]] = 1;
            }
            
            if (line == "") {continue;}
            count_total++;
            
            vector <string> check;
            boost::split(check,line,boost::is_any_of("\t"));

            int start_int = 0;
            if (vstart != "0")
            {
                try {
                    start_int = stoi(vstart);
                }
                catch (const std::exception& e) {
                    start_int = 0;
                }
            }
            
            int end_int = 0;
            if (vend != "0")
            {
                try {
                    end_int = stoi(vend);
                }
                catch (const std::exception& e) {
                    end_int = 0;
                }
            }
            
            
            if (v_chr != "")
            {
                if (v_chr != check[0]){continue;}
            }
            if (start_int > 0)
            {
                if (stoi(check[1]) < start_int){continue;}
            }
            if (end_int > 0)
            {
                if (stoi(check[1]) > end_int) {break;}
            }
            variants.push_back(stoi(check[1]));
            check.clear();
 
            
            int sample_size = samples.size();
            count = count + 1;
            total_variants++;
            results.emplace_back(
         pool.enqueue([count, line, format_index, info_index, filter_index, sample_size]
                      {
                          
                          vector <string> data;
                          boost::split(data,line,boost::is_any_of("\t"));
                          
                          string format = data[format_index];
                          vector <string> format_data;
                          boost::split(format_data,format,boost::is_any_of(":"));
                          int vindex = 0;
                          int GT = -1;
                          
                          for(auto &&item: format_data)
                          {
                              if (item == "GT") {GT = vindex;}
                              vindex++;
                          }
                          
                          if (GT == -1)
                          {
                              mtx.lock();
                              variants_rejected[stoi(data[1])]=1;
                              mtx.unlock();
                              return 1;
                          }
                          

                          int missing = 0;
                          int double_missing = 0;
                          int homozygous = 0;
                          int heterozygous = 0;
                          
                          for (int a = format_index+1; a < data.size(); a++)
                          {
                              vector <string> fields;
                              boost::split(fields,data[a],boost::is_any_of(":"));
                              string gt_value = fields[GT];
                              
                              if (gt_value == "") {missing++;missing++;double_missing++;return 1;}
                              if (gt_value == ".") {missing++;missing++;double_missing++;return 1;}
                              
                              vector <string> gt_decom;
                              boost::split(gt_decom,gt_value,boost::is_any_of("/"));
                              if (gt_decom.size() == 1) {boost::split(gt_decom,gt_value,boost::is_any_of("|"));}
                              
                              //homozygous
                              if ((gt_decom[0] == gt_decom[1]) && (gt_decom[0] != "."))
                              {
                                  homozygous++;
                                  samples_homozygous[samples[a]]++;
                              }
                              
                              // heterozugous
                               if (((gt_decom[0] != gt_decom[1]) && (gt_decom[0] != ".")) && (gt_decom[1] != "."))
                               {
                                   heterozygous++;
                                   samples_heterozygous[samples[a]]++;
                               }
                              
                              if ((gt_decom[0] == ".") && (gt_decom[1] == "."))
                              {
                                  double_missing++;
                                  samples_double_missing[samples[a]]++;
                              }
                              
                              if (gt_decom[0] == ".")
                              {
                                  missing++;
                                  total_missing++;
                                  mtx.lock();
                                  samples_missing[samples[a]]++;
                                  mtx.unlock();
                              }
                              if (gt_decom[1] == ".")
                              {
                                  missing++;
                                  total_missing++;
                                  mtx.lock();
                                  samples_missing[samples[a]]++;
                                  mtx.unlock();
                              }
                              
                          }
                          
                          mtx.lock();
                          variants_homozygous[stoi(data[1])] = homozygous;
                          variants_heterozygous[stoi(data[1])] = heterozygous;
                          variants_missing[stoi(data[1])] = missing;
                          variants_double_missing[stoi(data[1])] = double_missing;
                          mtx.unlock();
                          return stoi(data[1]);
                      })
         );
            
        }
        myfile.close();
    }
    else {cout << "error";return;}
    
    for(auto && result: results){result.get();} // waiting for all threads
    
    screen_message (screen_size, 2, "Loading and processing ... done", 1, v_quiet);
    
    ofstream out;
    out.open (v_out);
    out.close();
    out.open(v_out, std::ios_base::app);
    
    out << "vcfx::statistics" << endl;
    out << "General info" << endl;
    if (vstart != "") {out << "Starting point: " << vstart << endl;}
    if (vend != "") {out << "Ending point: " << vend << endl;}
    out << "Number of samples: " + to_string(total_samples) << endl;
    out << "Number of variants: " + to_string(total_variants) << endl;
    out << "Number of missing alleles: " + to_string(total_missing) << endl;
    out << endl;
    out << "Statistics per variant" << endl;
    out << "Variant\tHomozygous\tHeterozygous\tmissings\tdouble-missings" << endl;

    for(auto &&dado: variants)
    {
        out << dado;
        out << "\t" << variants_homozygous[dado];
        out << "\t" << variants_heterozygous[dado];
        out << "\t" << variants_missing[dado];
        out << "\t" << variants_double_missing[dado];
        out << endl;
    }
    
    out << endl;
    out << "Statistics per sample" << endl;
    out << "Sample\tHomozygous\tHeterozygous\tmissings\tdouble-missings" << endl;
    for(auto &&dado: sample_list)
    {
        out << dado.first << "\t" << to_string(dado.second);
        out << "\t" << samples_heterozygous[dado.first];
        out << "\t" << samples_missing[dado.first];
        out << "\t" << samples_double_missing[dado.first];
        if (samples_heterozygous[dado.first] == 0) {out << "\twarning:monomorphic";}
        out << endl;
    }
    
    out.close();
    
}


void help_statistics ()
{
    screen_message (screen_size, 0, "", 1, 0);
    screen_message (screen_size, 0, Program_name + "::statistics" , 1, 0);
    screen_message (screen_size, 0, "", 1, 0);
    screen_message (screen_size, 2, "* Author  : " + Program_author, 1, 0);
    screen_message (screen_size, 2, "* Contact : " + Program_contact, 1, 0);
    screen_message (screen_size, 2, "* Version : " + Program_version, 1, 0);
    screen_message (screen_size, 0, "", 1, 0);
    screen_message (screen_size, 2, "Options", 1, 0);
    screen_message (screen_size, 5, "input      the input VCF file [mandatory]", 1, 0);
    screen_message (screen_size, 5, "output     the statistics file to be created", 1, 0);
    screen_message (screen_size, 5, "threads    number of additional threads (default: " + to_string(v_threads) + ")", 1, 0);
    screen_message (screen_size, 5, "--quiet    quiet mode", 1, 0);
    screen_message (screen_size, 0, "", 1, 0);
    PrintWarnings();
    return;
}

// Entry point of the statistics sub-command: check the input file, choose a
// default output name when none was given, then run check_statistics().
void main_statistics ()
{
    // Without a readable input there is nothing to do: record a warning and
    // show the help text instead.
    if (! fileExists(v_input))
    {
        warnings.push_back("The input file could not be found.");
        help_statistics();
        return;
    }

    // Default output name: the input name without its extension, plus ".stat.txt".
    if (v_output == "")
    {
        v_output = GetFileNameWithoutExtension (v_input) + ".stat.txt";
    }

    check_statistics();
}
