/*

seven files:
-szmine_dataset.attribute2item
-szoutput_name.all.rulenum.txt
-szoutput_name.all.rules.txt
-szoutput_name.rep.rulenum.txt
-szoutput_name.rep.rules.txt
-szoutput_name.all.tidlist.txt
-szoutput_name.rep.tidlist.txt

one more:
szoutput_name.tgtvalues.txt

*/


#include <fstream>
#include <iostream>
#include <vector>
using namespace std;

#include <limits.h>
#include <time.h>
#include <sys/timeb.h>

#include <algorithm>
#include "global.h"

/*
// for detecting memory leak
#include <afx.h>
#define new DEBUG_NEW
#ifdef _DEBUG
#undef THIS_FILE
static char THIS_FILE[]=__FILE__;
#define new DEBUG_NEW
#endif
//*/

// Counters filled in by one holdout run; RandomHoldout() appends the last four
// of them to its summary file.

// copied from gnum_of_tests in holdout() after the mining statistics are loaded
int gnum_of_holdout_exp_tests;
// number of rules returned by LoadMinedRules() in holdout_rules()
int gnum_of_mined_rules;
// the three counts below are copied in holdout_rules() from the SIGN_RULE_NUM
// that recalculate() fills (num_of_sign_rules, num_of_BCsign_rules, num_of_BHsign_rules)
int gnum_of_holdout_valid_rules;
int gnum_of_holdout_BC_rules;
int gnum_of_holdout_BH_rules;


// forward declaration: read_convert() is defined after holdout(), which calls it
void read_convert(char *szcheck_dataset, vector<string> *pvec_attr_names, char* sztarget_attr, char* sztarget_value, map<string, int> *ptgtvalue_map, vector<int*> *converted, vector<int> *targets, char *szoutput_name);

/*
 Holdout evaluation: mines rules on the first dataset with MineRules() and
 re-tests every mined rule on the second dataset (see holdout_rules()).

 szmine_dataset   - name prefix of the mining dataset; "<prefix>.names" is read here
 szcheck_dataset  - name prefix of the holdout dataset; read by read_convert()
 szoutput_name    - prefix of every file that is written or read back
 sztarget_attr    - name of the target attribute
 sztarget_value   - target value; when empty, class labels are looked up in the
                    map loaded by LoadTgtValueMap()
 other arguments  - forwarded unchanged to MineRules(); dmine_pvalue is also
                    passed to holdout_rules() as its p-value threshold

 Side effects: sets gnum_of_classes to 2 before mining, stores gnum_of_tests in
 gnum_of_holdout_exp_tests, and allocates/frees gpdfactorials around the test.
*/
void holdout(char *szmine_dataset, char *szcheck_dataset, char *szoutput_name, 
			 char* sztarget_attr, char* sztarget_value, 
			 int nmine_min_sup, int nmine_max_len, double dmine_pvalue, double dmine_local_pvalue, 
			 int neffect_size_method, double dmine_effect_size)
{
	char szrule_output_name[200], sznames_filename[200];
	char szrep_rule_output_name[200];
	vector<string> vec_attr_names;
	map<string, int> tgtvalue_map;
	SIGN_RULE_NUM thesign_rule_nums;

	gnum_of_classes = 2;

	sprintf(szrule_output_name, "%s.sorted", szoutput_name);
	sprintf(szrep_rule_output_name, "%s.rep", szoutput_name);

	// mine first dataset
	MineRules(szmine_dataset, sztarget_attr, sztarget_value, nmine_min_sup, nmine_max_len, 
		dmine_pvalue, dmine_local_pvalue, neffect_size_method, dmine_effect_size, -1, 0, SEEDING_GLOBAL, szoutput_name);

	// read mined rules
	ReadTreeStatis(szoutput_name);

	LoadSignRuleNums(szrule_output_name, &thesign_rule_nums);
	gnum_of_holdout_exp_tests = gnum_of_tests;

	if(sztarget_value[0]==0)
		LoadTgtValueMap(szoutput_name, &tgtvalue_map);

	// load the target values of the mining dataset
	int *ptgt_values = new int[gndb_size];
	LoadTgtValues(szoutput_name, ptgt_values);
	
	sprintf(sznames_filename, "%s.names", szmine_dataset);
	LoadAttrNames(sznames_filename, &vec_attr_names);

	// read and convert second dataset; the vectors live on the stack so they
	// are released on every exit path, only the int arrays inside need delete[]
	vector<int*> converted;
	vector<int> targets;
	read_convert(szcheck_dataset, &vec_attr_names, sztarget_attr, sztarget_value, &tgtvalue_map, &converted, &targets, szoutput_name);

	if(converted.size()!=targets.size())
	{
		// valid() looks up targets by transaction index, so a mismatch would
		// either throw from vector::at() or silently pair rows with wrong labels
		printf("Error: %d transactions but %d target values in holdout data\n", (int)converted.size(), (int)targets.size());
	}
	else
	{
		// output tgtvalues
		output_tgtvalues(szoutput_name, ptgt_values, gndb_size, &targets);

		// size() is a size_t; cast so that the argument matches %d
		printf("#instances in training: %d\n", gndb_size);
		printf("#instances in testing: %d\n", (int)targets.size());

		gpdfactorials = new double[targets.size()+1];
		InitFactorials(gpdfactorials, (int)targets.size());
		// holdout all rules
		holdout_rules(szrule_output_name, &converted, (int)vec_attr_names.size(), &targets, szoutput_name, dmine_pvalue);
		// holdout rep rules
		//holdout_rules(szrep_rule_output_name, converted, targets, szoutput_name, dmine_pvalue);

		delete []gpdfactorials;
	}

	// free space
	for(size_t i=0; i<converted.size(); i++)
		delete []converted[i];
	delete [] ptgt_values;
}

/*
 Writes "<szoutput_name>.tgtvalues.txt": a single line holding the total number
 of values followed by the num entries of ptgt_values and then every entry of
 targets, each followed by one blank.
*/
void output_tgtvalues(char *szoutput_name, int *ptgt_values, int num, std::vector<int> *targets)
{
	char szpath[200];
	sprintf(szpath, "%s.tgtvalues.txt", szoutput_name);

	std::ofstream fout(szpath, std::ios::out);
	const std::vector<int> &second_part = *targets;

	// header: combined count of both parts
	fout << (second_part.size()+num) << " ";

	// first part: values handed in as a plain array
	int idx = 0;
	while(idx<num)
	{
		fout << ptgt_values[idx] << " ";
		++idx;
	}

	// second part: values of the vector
	for(std::vector<int>::const_iterator it=second_part.begin(); it!=second_part.end(); ++it)
		fout << *it << " ";

	fout << std::endl;
	fout.close();
}

/*
void read_convert(char *szcheck_dataset, vector<vector<int> *> *converted, std::vector<int> *targets, char *szoutput_name)
{
	// load attribute item map
	std::cout << "load map..." << std::endl;
	char szattrvalue2item_filename[200];
	std::map<std::string, int> item_map;
	std::map<std::string, int>::iterator map_it;

	sprintf(szattrvalue2item_filename, "%s.attrvalue2item.txt", szoutput_name);
	LoadAttrValues2ItemMap(szattrvalue2item_filename, &item_map);

	// read dataset and convert
	char dataset[200];
	sprintf(dataset, "%s.data", szcheck_dataset);
	std::ifstream in(dataset, std::ios::in);

	char buf[200];
	while(!in.eof()) {
		std::string s;
		std::getline(in,s);
		if(s.length()==0) continue;
		std::vector<std::string> tokens;
		split(s,tokens, ',');

		std::vector<int> *tmp = new std::vector<int>();
		targets->push_back(atoi(tokens[0].c_str()));
		for(int i=1; i<tokens.size(); i++) {
			sprintf(buf, "attribute_%d=%d", i, atoi(tokens[i].c_str()));
			std::map<std::string,int>::iterator iter = item_map.find(buf);
			if(iter==item_map.end()) std::cout << buf << std::endl;
			else tmp->push_back(iter->second);	
		}

		converted->push_back(tmp);
	}

	in.close();
}
*/

/*
 Reads "<szcheck_dataset>.data" (one comma separated instance per line) and
 converts each instance into an array of item ids.

 pvec_attr_names - attribute name of every column, in column order
 sztarget_attr   - name of the column that holds the class label
 sztarget_value  - when non-empty the label is 1 if the column equals this
                   value and 0 otherwise; when empty the label is looked up in
                   ptgtvalue_map (-1 plus a warning if the value is unknown)
 converted       - receives one new int[] per accepted row; the caller owns the
                   arrays and must delete[] them
 targets         - receives the label of every accepted row, same order
 szoutput_name   - "<szoutput_name>.attrvalue2item.txt" supplies the
                   "attribute=value" -> item id map

 Every array has exactly pvec_attr_names->size() slots. The mapped items come
 first, ordered by comp_int (assumed ascending - confirm against its
 definition); the remaining slots are padded with INT_MAX so that code which
 scans the full array length (valid() passes the column count to IsSubset())
 never reads uninitialised memory.

 Values "?" and "ignore" are skipped. "attribute=value" pairs that are missing
 from the item map are printed to stdout and skipped. Rows without a usable
 target value are dropped so that converted and targets stay index-aligned.
*/
void read_convert(char *szcheck_dataset, vector<string> *pvec_attr_names, char* sztarget_attr, char* sztarget_value, map<string, int> *ptgtvalue_map, vector<int*> *converted, vector<int> *targets, char *szoutput_name)
{
	// load attribute item map
	std::cout << "load map..." << std::endl;
	char szattrvalue2item_filename[200];
	map<string, int> item_map;
	map<string, int>::iterator iter, map_it;
	const int num_of_clmns = (int)pvec_attr_names->size();

	sprintf(szattrvalue2item_filename, "%s.attrvalue2item.txt", szoutput_name);
	LoadAttrValues2ItemMap(szattrvalue2item_filename, &item_map);

	// read dataset and convert
	const std::string dataset = std::string(szcheck_dataset) + ".data";
	std::ifstream in(dataset.c_str(), std::ios::in);
	if(!in)
	{
		// a stream that failed to open never reaches eof, so looping on
		// !in.eof() would spin forever; report and give up instead
		printf("Error: cannot open file %s for reading\n", dataset.c_str());
		return;
	}

	int num_of_unlabelled = 0;
	std::string s;
	while(std::getline(in, s)) 
	{
		if(s.length()==0) continue;
		std::vector<std::string> tokens;
		split(s, tokens, ',');

		const int num_of_tokens = (int)tokens.size();
		if(num_of_clmns!=num_of_tokens)
			printf("Error: inconsistent number of columns\n");
		// never index past the attribute names or the item array
		const int num_of_used = (num_of_tokens<num_of_clmns) ? num_of_tokens : num_of_clmns;

		int *tmp = new int[num_of_clmns];
		int nclmn_no = 0;
		bool has_target = false;
		int ntarget = 0;
		for(int i=0; i<num_of_used; i++) 
		{
			const std::string &token = tokens[i];
			if(token=="?" || token=="ignore")
				continue;

			const std::string &attr_name = (*pvec_attr_names)[i];
			if(strcmp(sztarget_attr, attr_name.c_str())==0)
			{
				has_target = true;
				if(sztarget_value[0]==0)
				{
					map_it = ptgtvalue_map->find(token);
					if(map_it!=ptgtvalue_map->end())
						ntarget = map_it->second;
					else
					{
						printf("Warning: class %s does not occur in mining data\n", token.c_str());
						ntarget = -1;
					}
				}
				else if(strcmp(sztarget_value, token.c_str())==0)
					ntarget = 1;
				else
					ntarget = 0;
			}
			else
			{
				// std::string key instead of a fixed char buffer: no overflow
				// for long attribute names or values
				const std::string key = attr_name + "=" + token;
				iter = item_map.find(key);
				if(iter==item_map.end()) 
					cout << key << endl;
				else 
					tmp[nclmn_no++] = iter->second;
			}
		}

		if(!has_target)
		{
			// a transaction without a label would shift all later labels
			delete [] tmp;
			num_of_unlabelled++;
			continue;
		}

		qsort(tmp, nclmn_no, sizeof(int), comp_int);
		// pad the unused tail; INT_MAX is larger than any item id
		for(int i=nclmn_no; i<num_of_clmns; i++)
			tmp[i] = INT_MAX;

		converted->push_back(tmp);
		targets->push_back(ntarget);
	}

	if(num_of_unlabelled>0)
		printf("Warning: %d instances without target value were skipped\n", num_of_unlabelled);

	in.close();

}

void holdout_rules(char *szrule_output_name, vector<int *> *converted, int ntrans_len, vector<int> *targets, char *szoutput_name, double dvalid_pvalue)
{
		ASSOCRULE *pmined_rules;
		int *pmined_itemset_buf, *pmined_tidlist_buf;
		int num_of_mined_rules, nmax_mined_rule_len;
		SIGN_RULE_NUM thesign_rule_nums;

		num_of_mined_rules = LoadMinedRules(szrule_output_name, pmined_rules, pmined_itemset_buf, pmined_tidlist_buf, nmax_mined_rule_len);
		LoadMinedTidList(szoutput_name, pmined_rules, num_of_mined_rules);

		gnum_of_mined_rules = num_of_mined_rules;
		
		// valid through second dataset
		std::vector<ASSOCRULE *> *pvalid_rules = new std::vector<ASSOCRULE *>();
		cout << "before validation: " << num_of_mined_rules << std::endl;
		valid(gndb_size, pmined_rules, num_of_mined_rules, converted, ntrans_len, targets, pvalid_rules, gnmin_sup, dvalid_pvalue);
		cout << "after validation: " << pvalid_rules->size() << std::endl;

		sort(pvalid_rules->begin(), pvalid_rules->end(), compare);

		// recalculate
		recalculate(pvalid_rules, num_of_mined_rules, dvalid_pvalue, thesign_rule_nums);

		gnum_of_holdout_valid_rules = thesign_rule_nums.num_of_sign_rules;
		gnum_of_holdout_BC_rules = thesign_rule_nums.num_of_BCsign_rules;
		gnum_of_holdout_BH_rules = thesign_rule_nums.num_of_BHsign_rules;

		// output
		output_holdout_rules(pvalid_rules, szrule_output_name, thesign_rule_nums, num_of_mined_rules, dvalid_pvalue);

		//cout << "after output" << endl;

		// free space
		delete []pmined_rules;
		delete []pmined_itemset_buf;
		delete []pmined_tidlist_buf;

		for(int i=0; i<pvalid_rules->size(); i++) {
			delete [] pvalid_rules->at(i)->pattern;
			delete [] pvalid_rules->at(i)->ptid_list;
			delete pvalid_rules->at(i);
		}
		delete pvalid_rules;
		
}

/*
 Returns true when every element of pset1[0..nlen1) also occurs in
 pset2[0..nlen2). Both arrays are walked once from the front, which is only
 meaningful when both are sorted in ascending order.
*/
bool IsSubset(int *pset1, int nlen1, int* pset2, int nlen2)
{
	int nmatched = 0;	// elements of pset1 located so far

	for(int npos=0; nmatched<nlen1 && npos<nlen2; ++npos)
	{
		// pset2 has already moved past the wanted value: it cannot be there
		if(pset1[nmatched]<pset2[npos])
			return false;

		if(pset1[nmatched]==pset2[npos])
			++nmatched;
	}

	return nmatched>=nlen1;
}

/*
 Tests every mined rule on the holdout transactions and appends a deep copy of
 each accepted rule to pvalid_rules (the caller frees pattern, ptid_list and
 the rule itself).

 nfirst_dataset_size - offset added to holdout transaction indexes when they
                       are appended to a rule's tid list
 pmined_rules        - num_of_mined_rules rules loaded from the mining run
 converted           - holdout transactions; ntrans_len entries of each array
                       are handed to IsSubset()
 targets             - class label per holdout transaction
 nmin_sup            - NOTE(review): not used; the support test below reads the
                       global gnmine_min_sup instead - confirm this is intended
 dvalid_pvalue       - largest holdout p-value a rule may have

 A rule is accepted when its coverage summed over both datasets reaches
 gnmine_min_sup, its two-tailed Fisher p-value on the holdout data is at most
 dvalid_pvalue, and its confidence over both datasets reaches gdmine_min_conf.
 The copy stores the holdout p-value in dpvalue and the combined support
 counts and tid list.
*/
void valid(int nfirst_dataset_size, ASSOCRULE *pmined_rules, int num_of_mined_rules, vector<int*>* converted, int ntrans_len, vector<int> *targets,  vector<ASSOCRULE *> *pvalid_rules, int nmin_sup, double dvalid_pvalue)
{
	const int num_of_instances = (int)targets->size();
	const int num_of_trans = (int)converted->size();
	int positive = 0, i;
	// class frequencies, only filled in the multi-class case; a vector is
	// released on every path, whatever value gnum_of_classes has
	std::vector<int> class_distr;

	std::cout << "no. of instances: " << targets->size() << std::endl;

	if(gnum_of_classes==2)
	{
		for(i=0; i<num_of_instances; i++)
			if(targets->at(i)==1) positive++;
		std::cout << "no. of positive: " << positive << std::endl;
	}
	else
	{
		class_distr.resize(gnum_of_classes>0 ? gnum_of_classes : 0, 0);
		for(i=0; i<num_of_instances; i++)
		{
			// read_convert() stores -1 for classes unseen during mining;
			// such labels belong to no class and must not be used as index
			const int nlabel = targets->at(i);
			if(nlabel>=0 && nlabel<(int)class_distr.size())
				class_distr[nlabel]++;
		}
		cout << "class frequency: ";
		for(i=0; i<(int)class_distr.size(); i++)
			cout << class_distr[i] << " ";
		cout << endl;
	}

	// validation
	std::cout << "validation..." << std::endl;
	for(i=0; i<num_of_mined_rules; i++) 
	{
		ASSOCRULE *tmp = pmined_rules+i;
		int coverage = 0, support = 0;
		std::vector<int> ids;	// holdout transactions covered by the rule

		for(int k=0; k<num_of_trans; k++) 
		{
			if(!IsSubset(tmp->pattern, tmp->npat_len, (*converted)[k], ntrans_len))
				continue;

			ids.push_back(k);
			coverage++;
			if(gnum_of_classes==2)
			{
				if(targets->at(k)==1) 
					support++;
			}
			else if(targets->at(k)==tmp->nclass_no)
				support++;
		}

		// size of the rule's class in the holdout data
		int nclass_freq = positive;
		if(gnum_of_classes!=2)
		{
			const bool known_class = tmp->nclass_no>=0 && tmp->nclass_no<(int)class_distr.size();
			nclass_freq = known_class ? class_distr[tmp->nclass_no] : 0;
		}
		const double p_value = CalcTwoTailedFisherPvalue(num_of_instances, coverage, nclass_freq, support);

		if(coverage+tmp->nsup>=gnmine_min_sup && p_value<=dvalid_pvalue && 
			(double)(support+tmp->ntgt_sup)/(coverage+tmp->nsup)>=gdmine_min_conf) 
		{
			ASSOCRULE *accepted = new ASSOCRULE();
			accepted->dadjusted_pvalue = tmp->dadjusted_pvalue;
			accepted->dcond_pvalue = tmp->dcond_pvalue;
			accepted->dpvalue = p_value;
			accepted->dscore = tmp->dscore;
			accepted->npreorder = tmp->npreorder;
			accepted->nclass_no = tmp->nclass_no;

			accepted->npat_len = tmp->npat_len;
			accepted->pattern = new int[accepted->npat_len];
			for(int index=0; index<accepted->npat_len; index++)
				accepted->pattern[index] = tmp->pattern[index];

			// tid list: mining tids first, then the holdout tids shifted
			// behind the mining dataset
			accepted->ptid_list = new int[tmp->nsup+coverage];
			for(int s=0; s<tmp->nsup; s++)
				accepted->ptid_list[s] = tmp->ptid_list[s];
			for(int s=0; s<coverage; s++)
				accepted->ptid_list[s+tmp->nsup] = nfirst_dataset_size+ids[s];
			accepted->nsup = tmp->nsup+coverage;
			accepted->ntgt_sup = tmp->ntgt_sup+support;

			pvalid_rules->push_back(accepted);
		}
	}

}

/*
 Derives the significance counts of the validated rules from their holdout
 p-values (dpvalue) and stores them in thesign_rule_nums.

 num_of_sign_rules   - number of validated rules
 num_of_BCsign_rules - rules with p <= dvalid_pvalue/num_of_mined_rules
 num_of_BHsign_rules - Benjamini-Hochberg step-up: the largest rank k
                       (1-based, p-values ascending) with
                       p(k) <= k*dvalid_pvalue/num_of_mined_rules
 dBH_pvalue_thres    - p(k) of that rank, 0 when no rank qualifies

 All permutation related fields are reset to 0.
*/
void recalculate(std::vector<ASSOCRULE *> *pvalid_rules, int num_of_mined_rules, double dvalid_pvalue, SIGN_RULE_NUM &thesign_rule_nums)
{
	std::vector<double> pvalues;
	pvalues.reserve(pvalid_rules->size());
	for(size_t i=0; i<pvalid_rules->size(); i++)
		pvalues.push_back(pvalid_rules->at(i)->dpvalue);
	std::sort(pvalues.begin(), pvalues.end());

	thesign_rule_nums.num_of_sign_rules = (int)pvalues.size();
	thesign_rule_nums.num_of_BCsign_rules = 0;
	thesign_rule_nums.num_of_BHsign_rules = 0;
	thesign_rule_nums.dBH_pvalue_thres = 0;

	// per-test level; without mined rules there is nothing to divide by
	const double bf_thres = (num_of_mined_rules>0) ? dvalid_pvalue/num_of_mined_rules : 0.0;

	// the p-values are sorted, so the first one above the fixed threshold
	// ends the scan
	for(size_t i=0; i<pvalues.size(); i++) 
	{
		if(pvalues[i]>bf_thres)
			break;
		thesign_rule_nums.num_of_BCsign_rules = (int)(i+1);
	}

	// the BH threshold grows with the rank, so a rank that fails can be
	// followed by one that passes: scan every rank and keep the largest
	// passing one instead of stopping at the first failure
	for(size_t i=0; i<pvalues.size(); i++) 
	{
		if(pvalues[i]<=bf_thres*(i+1)) 
		{
			thesign_rule_nums.num_of_BHsign_rules = (int)(i+1);
			thesign_rule_nums.dBH_pvalue_thres = pvalues[i];
		}
	}

	thesign_rule_nums.dpermBH_pvalue_thres = 0;
	thesign_rule_nums.num_of_perm_BCsign_rules = 0;
	thesign_rule_nums.num_of_perm_BHsign_rules = 0;
	thesign_rule_nums.num_of_perm_sign_rules = 0;
	thesign_rule_nums.num_of_permFWER_rules = 0;

}

/*
 Writes the validated rules to three files derived from szoutput_name:
   "<name>.rules.txt"   - per rule: pattern length, the items, the class
                          number (only when gnum_of_classes>2), then the four
                          values 0, dpvalue, 0, 0
   "<name>.tidlist.txt" - per rule: nsup, ntgt_sup and the nsup tids
   "<name>.rulenum.txt" - twelve lines: number of mined rules, dmine_pvalue,
                          BH p-value threshold, two zeros, the three
                          significance counts, four zeros
*/
void output_holdout_rules(std::vector<ASSOCRULE *> *pvalid_rules, char *szoutput_name, SIGN_RULE_NUM &thesign_rule_nums, int num_of_mined_rules, double dmine_pvalue)
{
	char szrules_name[200], sztidlist_name[200], szrulenum_name[200];
	sprintf(szrules_name, "%s.rules.txt", szoutput_name);
	sprintf(sztidlist_name, "%s.tidlist.txt", szoutput_name);
	sprintf(szrulenum_name, "%s.rulenum.txt", szoutput_name);

	std::ofstream tid_out(sztidlist_name, std::ios::out);
	std::ofstream rule_out(szrules_name, std::ios::out);

	const std::vector<ASSOCRULE *> &rules = *pvalid_rules;
	for(size_t r=0; r<rules.size(); ++r) 
	{
		const ASSOCRULE *prule = rules.at(r);
		int pos;

		// rule line
		rule_out << prule->npat_len << " ";
		for(pos=0; pos<prule->npat_len; ++pos)
			rule_out << prule->pattern[pos] << " ";
		if(gnum_of_classes>2)
			rule_out << prule->nclass_no << " ";
		rule_out << 0.0 << " " << prule->dpvalue << " " << 0.0 << " " << 0.0 << " " << std::endl;

		// tid line
		tid_out << prule->nsup << " " << prule->ntgt_sup << " ";
		for(pos=0; pos<prule->nsup; ++pos)
			tid_out << prule->ptid_list[pos] << " ";
		tid_out << std::endl;
	}
	rule_out.close();
	tid_out.close();

	// summary numbers, one per line
	std::ofstream num_out(szrulenum_name, std::ios::out);
	num_out << num_of_mined_rules << std::endl
		<< dmine_pvalue << std::endl
		<< thesign_rule_nums.dBH_pvalue_thres << std::endl;
	num_out << 0.0 << std::endl << 0.0 << std::endl;
	num_out << thesign_rule_nums.num_of_sign_rules << std::endl
		<< thesign_rule_nums.num_of_BCsign_rules << std::endl
		<< thesign_rule_nums.num_of_BHsign_rules << std::endl;
	for(int nzero=0; nzero<4; ++nzero)
		num_out << 0 << std::endl;

	num_out.close();

}


/*
 Deep copy of a rule: the scalar fields are assigned and dest receives freshly
 allocated pattern and ptid_list arrays filled from src. Whatever dest->pattern
 and dest->ptid_list pointed to before is not freed here.
*/
void copy(ASSOCRULE *dest, ASSOCRULE *src)
{
	// p-values and score
	dest->dpvalue = src->dpvalue;
	dest->dadjusted_pvalue = src->dadjusted_pvalue;
	dest->dcond_pvalue = src->dcond_pvalue;
	dest->dscore = src->dscore;

	// lengths, counts and ids
	dest->npat_len = src->npat_len;
	dest->nsup = src->nsup;
	dest->ntgt_sup = src->ntgt_sup;
	dest->nclass_no = src->nclass_no;
	dest->npreorder = src->npreorder;

	// own storage for both arrays
	dest->pattern = new int[dest->npat_len];
	dest->ptid_list = new int[dest->nsup];

	int k = 0;
	while(k<dest->npat_len)
	{
		dest->pattern[k] = src->pattern[k];
		++k;
	}
	k = 0;
	while(k<dest->nsup)
	{
		dest->ptid_list[k] = src->ptid_list[k];
		++k;
	}
}

// Ordering predicate for sorting rules: true when first has the strictly
// smaller dpvalue.
bool compare(ASSOCRULE *first, ASSOCRULE *second)
{
	return first->dpvalue < second->dpvalue;
}

/*
 Splits s at every delim and appends the pieces to tokens.

 Blanks (' ') in front of a piece are dropped; blanks inside or behind it are
 kept. Pieces that are empty after that are not appended, so consecutive
 delimiters do not produce empty tokens. tokens is not cleared first.

 The pieces are collected in a std::string: the former char buffer had room
 for s.length() characters only, so the terminating '\0' of a piece that
 spanned the whole input was written one byte past its end.
*/
void split(std::string s, std::vector<std::string> &tokens, char delim)
{
	std::string token;

	for(std::string::size_type i=0; i<s.length(); i++) 
	{
		const char c = s[i];
		if(c==delim) 
		{
			if(!token.empty()) 
				tokens.push_back(token);
			token.clear();
			continue;
		}
		// a blank only counts once the piece has started
		if(!token.empty() || c!=' ')
			token += c;
	}

	if(!token.empty()) 
		tokens.push_back(token);

}

void RandomHoldout(char *szdataset_name,  
			 char* sztarget_attr, char* sztarget_value, 
			 int nmine_min_sup, int nmine_max_len, double dmine_pvalue, 
			 double dmine_effect_size, char *szoutput_name)
{

	FILE *fp;

	char szpart[200], szmine_dataset[200], szcheck_dataset[200], szrule_output_name[200];
	vector<int> mapping;
	struct timeb start, end;
	double drandom_holdout_time;
	SIGN_RULE_NUM thesign_rule_nums;

	ftime(&start);
	sprintf(szpart, "%s_part", szdataset_name);
	partition(szdataset_name, szpart, mapping);
	sprintf(szpart, "%s_part_0",szdataset_name);
	copy_namefile(szdataset_name, szpart);

	// try hold out
	sprintf(szpart,"%s_part", szdataset_name);
	sprintf(szmine_dataset, "%s_%d", szpart, 0);
	sprintf(szcheck_dataset, "%s_%d", szpart, 1);

	holdout(szmine_dataset, szcheck_dataset,  szoutput_name, sztarget_attr, sztarget_value, 
		nmine_min_sup, nmine_max_len, dmine_pvalue, 0.05, CONFIDENCE, dmine_effect_size);


	sprintf(szrule_output_name, "%s.sorted", szoutput_name);
	LoadSignRuleNums(szrule_output_name, &thesign_rule_nums);

	ftime(&end);
	drandom_holdout_time = end.time-start.time+(double)(end.millitm-start.millitm)/1000;

	printf("#significant rules at %.2E level: %d\n", thesign_rule_nums.dpvalue_thres, thesign_rule_nums.num_of_sign_rules);
	printf("#significant rules at %.2E level: %d\n", thesign_rule_nums.dBC_pvalue_thres, thesign_rule_nums.num_of_BCsign_rules);
	printf("#significant rules at FDR %.2E: %d (%.3E)\n", thesign_rule_nums.dpvalue_thres, thesign_rule_nums.num_of_BHsign_rules, thesign_rule_nums.dBH_pvalue_thres);
	printf("\n");

	fp = fopen("Holdout.sum.txt", "a+");
	if(fp==NULL)
		printf("Error: cannot open file holdout.sum.txt for appending\n");
	else
	{
		fprintf(fp, "RANDOM_HOLDOUT %s\t", szdataset_name);
		fprintf(fp, "%d %d %.4f %.4f\t", nmine_min_sup, nmine_max_len, dmine_pvalue, 0.05);
		fprintf(fp, "CONFIDENCE %.3f\t", dmine_effect_size);
		fprintf(fp, "%.4f\t", drandom_holdout_time);
		fprintf(fp, "%d %d %d %d\t", gnum_of_mined_rules, gnum_of_holdout_valid_rules, gnum_of_holdout_BC_rules, gnum_of_holdout_BH_rules);
		fprintf(fp, "%.3E %.3E %.3E\t", thesign_rule_nums.dpvalue_thres, thesign_rule_nums.dBC_pvalue_thres, thesign_rule_nums.dBH_pvalue_thres);
		fprintf(fp, "%d %d %d\t", thesign_rule_nums.num_of_sign_rules, thesign_rule_nums.num_of_BCsign_rules, thesign_rule_nums.num_of_BHsign_rules);
		fprintf(fp, "\n");

		fclose(fp);
	}

}
