// This file creates a tree from the haplotypes
// Then it generates the haplotypes from the tree and compare the
// haplotypes to the input haplotypes to see if there are the same

#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include "String.h"

int *sortcount;
Data *LabelSet;

// calculate the leaf-counts for each columns
int *caculate_leaf_count(Graph S) {
	int *count = new int[S.numEdge];
	int i,j,temp;

	for(i=0;i<S.numEdge;i++)
		count[i] = 0;

	for(i=0;i<S.numEdge;i++)
		for(j=0;j<S.numPath;j++) {
			temp = (int)S.content[j*S.numEdge+i] - 48;
			
			if(temp == 1)
				temp = 2;
			else if(temp == 2)
				temp = 1;

			count[i] = count[i]+temp;
		}

	return count;
}

int compare (const void * a, const void * b)
{
  return ( *(int*)a - *(int*)b );
}

// Add the edge into the path if its corresponding content is 1
Data *create_R1(Graph S) {
	Data *R1 = newData();
	int i,j;
	char *path;

	for(i=0;i<S.numPath;i++)
		for(j=0;j<S.numEdge;j++) {
			if(S.content[i*S.numEdge+j]=='1') {
				path = search_string(S.Path,i);
				R1 = appendData(R1,path);
				break;
			}
		}

	return R1;
}

DataA *ResultA;
Data *extra_edges = newData();

// Add the edges into the paths
void create_T1(Data *R2,Graph S,int whichone) {
	int i,j,pos,len = DataLength(R2);
	Data *sort;
	DataA *pointto;
	char *value;
	int no = -1;
	
	for(i=0;i<len;i++) {
		no = -1;
		sort = newData();
		pos = search_pos(S.Path,R2->value);
		for(j=0;j<S.numEdge;j++) {
			if(S.content[pos*S.numEdge+j]=='1' && no!=sortcount[j]) {				
				value = search_string(S.Edge,j);
				sort = appendData(sort,value);
				no = sortcount[j];
			}
		}
		pointto = locate_DataA(ResultA,pos);
		delete_data(pointto->row);
		pointto->row = copyData(sort);
		
		delete_data(sort);
		R2 = R2->next;
	}
}

void replace_node(tree T1,char *value,char *node) {
	int i,pos = search_pos(T1.edge,value);

	for(i=0;i<2*pos;i++) 
		T1.node = T1.node->next;
	
	T1.node->value = node;
}

// Merge path together if they have the same edges
tree Merge_Path(tree T1, int len) {
	Data *hresult;
	int l,i,j,pos;
	char *node;
	DataA *pointto;
	
	for(i=0;i<len;i++) {
		pointto = locate_DataA(ResultA,i);
		hresult = pointto->row;
		
		l = DataLength(hresult);
		for(j=0;j<l;j++) {
			
			if(j==0)
				node = "root";
			
			pos = search_pos(T1.edge,hresult->value);
			if(pos==-1) {
				T1.edge = appendData(T1.edge,hresult->value);
				T1.node = appendData(T1.node,node);
				T1.node = appendData(T1.node,hresult->value);
			} else {
				if(j!=0) {
					replace_node(T1,hresult->value,node);
				}
			}

			node = hresult->value;
			hresult = hresult->next;
		}
	}
	return T1;
}

int *leaves_record;

// sort the graph according to the leaf-counts
Graph sort_graph(Graph S, int *count) {
	int i,j,k;
	int *scount = new int[S.numEdge];
	sortcount = new int[S.numEdge];
	Graph G;
	char *value;
	
	G.content = new char[S.numEdge*S.numPath+1];
	G.numEdge = S.numEdge;
	G.numPath = S.numPath;
	G.Edge = newData();
	G.Path = newData();
		
	for(i=0;i<S.numEdge;i++)
		scount[i] = count[i];

	qsort(scount,S.numEdge,sizeof(int),compare);
	
	for(i=0;i<S.numEdge;i++) 
		sortcount[i] = scount[S.numEdge-i-1];
		
	G.Path = copyData(S.Path);
	
	for(i=0;i<S.numEdge;i++)
		for(j=0;j<S.numEdge;j++) {
			if(sortcount[i] == count[j]) {
				count[j] = -1;
				value = search_string(S.Edge,j);
				G.Edge = appendData(G.Edge,value);
				
				for(k=0;k<S.numPath;k++)
					G.content[k*S.numEdge+i] = S.content[k*S.numEdge+j]; 
				break;
			}
		}
	delete [] scount;
	return G;
}

// find identical columns
Graph handle_duplicate(Graph S) {
	int i,j,k;
	int count;
	char *previous;
	
	Data *original = copyData(S.Edge);
	
	for(i=0;i<S.numPath;i++) {
		count = -1;	
	
		for(j=0;j<S.numEdge;j++) {
			if(S.content[i*S.numEdge+j]=='1') {
				if(sortcount[j]==count) {
					Data *hedge = S.Edge;
				
					k = 0;
					while(strcmp(hedge->value,"END")!=0) {
						
						if(sortcount[k]==count && S.content[i*S.numEdge+k]=='1') 
							hedge->value = previous;
							
						hedge = hedge->next;	
						k++;
					}
				}
				count = sortcount[j];
				previous = search_string(S.Edge,j);
			}
		}
	}
	
	Graph L;
	Data *he,*hEdge;
	int enumber = 0;
	char *value;

	// move same edges together
	L.numEdge = S.numEdge;
	L.numPath = S.numPath;
	L.Path = copyData(S.Path);
	L.Edge = newData();
	
	L.content = new char[L.numEdge*L.numPath];
	hEdge = S.Edge;
	
	for(i=0;i<S.numEdge;i++) {
		if(strcmp(hEdge->value,"cc")!=0) {
			value = hEdge->value;
			he = hEdge;
					
			for(j=i;j<S.numEdge;j++) {
				if(strcmp(he->value,value)==0) {
					char *vv = search_string(original,j);
					
					LabelSet = appendData(LabelSet,vv);
					L.Edge = appendData(L.Edge,he->value);
					for(k=0;k<S.numPath;k++) {
						L.content[k*S.numEdge+enumber] = S.content[k*S.numEdge+j];
					}
					he->value = "cc";
					enumber++;
				}
				he = he->next;
			}
			LabelSet = appendData(LabelSet,"|");
		}	
		hEdge = hEdge->next;
	}
	
	L.content[L.numEdge*L.numPath] = '\0';
	
	delete_graph(S);
	return L;
}

// read input haplotype data
Graph init(char *filename)
{
	Graph L;
	Data *phead,*ehead;
	FILE *fp;
	fp = fopen(filename,"r");
	int row,column,i,digit;
	float frow,fcolumn;
	char *temp;
	
	ehead = newData();
	phead = newData();
	
	if(fp==NULL) {
		printf("Can not open %s\n",filename);
		exit(1);
	}
		
	fscanf(fp,"%f",&frow);
	fscanf(fp,"%f",&fcolumn);
	
	row = (int)frow;
	column = (int)fcolumn;
	
	L.content = new char[row*column+1];

	for(i=1;i<=row;i++) {
		temp = generate_node("p",i);
		phead = appendData(phead,temp);
	}
	
	for(i=1;i<=column;i++) {
		temp = generate_node("e",i);
		ehead = appendData(ehead,temp);
	}

	int index = 0;
	while(fscanf(fp,"%d",&digit)!=EOF) {
		L.content[index] = (char)(digit+48);
		index++;
	}		
	L.content[index] = '\0';
			
	L.numEdge = DataLength(ehead);
	L.numPath = DataLength(phead);

	L.Edge = ehead;
	L.Path = phead;
	
	fclose(fp);
	
	return L;
}

void write_to_file(Data *PathSet,Graph S, Graph G, char *filename,char *vfilename,char *cvector) {
	FILE *fp;
	int i,len;
	Data *hedge,*hpath;
	char *title = "Paths: ";
	
	fp = fopen(filename,"w");
	if(fp==NULL)
		printf("Can not open file %s\n",filename);
	
	fprintf(fp,"%s",title);
	
	len = DataLength(PathSet);
	for(i=0;i<len;i++) {
		fprintf(fp,"%s ",PathSet->value);
		PathSet = PathSet->next;
	}
	
	fprintf(fp,"\n\n");
	
	fprintf(fp,"Graph1:\n");
	fprintf(fp,"%d ",G.numEdge);
	fprintf(fp,"%d\n",G.numPath);
	hedge = G.Edge;
	
	for(i=0;i<G.numEdge;i++) {
		fprintf(fp,"%s ",hedge->value);
		hedge = hedge->next;
	}
	fprintf(fp,"\n");
	
	hpath = G.Path;
	
	for(i=0;i<G.numPath;i++) {
		fprintf(fp,"%s ",hpath->value);
		hpath = hpath->next;
	}
	fprintf(fp,"\n");
	
	fprintf(fp,"%s\n\n",G.content);
	
	////////////////////////////////////////////
	fprintf(fp,"Graph2:\n");
	fprintf(fp,"%d ",S.numEdge);
	fprintf(fp,"%d\n",S.numPath);
	hedge = S.Edge;
	
	for(i=0;i<S.numEdge;i++) {
		fprintf(fp,"%s ",hedge->value);
		hedge = hedge->next;
	}
	fprintf(fp,"\n");
	
	hpath = S.Path;
	
	for(i=0;i<S.numPath;i++) {
		fprintf(fp,"%s ",hpath->value);
		hpath = hpath->next;
	}
	fprintf(fp,"\n");
	
	fprintf(fp,"%s\n\n",S.content);
	
	fprintf(fp,"Vector:\n");
	fprintf(fp,"%s ",vfilename);
	fprintf(fp,"%s\n\n",cvector);
	
	fprintf(fp,"Labels: ");
	
	Data *hlabel = LabelSet;
	len = DataLength(LabelSet);
	for(i=0;i<len;i++) {
		fprintf(fp,"%s ",hlabel->value);
		hlabel = hlabel->next;
	}
	
	fprintf(fp,"\n\n");
	
	fclose(fp);
}

char *generate_default(char *cvector, Graph S) {
	int i,j;
	int one,zero;
	
	for(i=0;i<S.numEdge;i++) {
		one = 0;
		zero = 0;
		for(j=0;j<S.numPath;j++) {
			if(S.content[j*S.numEdge+i]=='1')
				one++;
			else if(S.content[j*S.numEdge+i]=='0')
				zero++;
		}
		if(zero >= one)
			cvector[i] = '0';
		else
			cvector[i] = '1';
	}
	cvector[i] = '\0';
	return cvector;
}

char *read_vector(char *cvector,char *filename, Graph S) {
	int num = S.numEdge;
	cvector = new char[num+1];
	FILE *fp;
	int i = 0;
	int tt;
	
	if(strcmp(filename,"d")!=0 && strcmp(filename,"m")!=0) {
		fp = fopen(filename,"r");
		
		for(i=0;i<num;i++) {	
			if(fscanf(fp,"%d",&tt)==EOF) {
				printf("vector information error!\n");
				exit(0);
			}
			cvector[i] = (char)tt+48;
		}
		cvector[i] = '\0';
		fclose(fp);
	} else if(strcmp(filename,"m")==0) {
		cvector = generate_default(cvector,S);
	} else if(strcmp(filename,"d")==0) {
		for(i=0;i<num;i++) 
			cvector[i] = '0';
		cvector[i] = '\0';
	}
	return cvector;
}

Graph convert_data(Graph S, char *vector) {
	int i,j;
	
	for(i=0;i<S.numEdge;i++) {
		if(vector[i] == '1') {
			for(j=0;j<S.numPath;j++) {
				if(S.content[j*S.numEdge+i]=='1')
					S.content[j*S.numEdge+i] = '0';
				else if(S.content[j*S.numEdge+i]=='0')
					S.content[j*S.numEdge+i] = '1';
			}
		}
	}
	
	return S;
}


int check_order(Graph S) {
	int i,j;
	int check = 0,check2 = 0;
	int previous = sortcount[0];
	int hasone = 0,hastwo =0,hasone_again=0;
	
	for(i=0;i<S.numPath;i++) {
		hasone = hastwo = hasone_again = 0;
		check = check2 = 0;
		for(j=0;j<S.numEdge;j++) {
			if(previous!=sortcount[j]) {
				if(check==1 && check2 ==2) {
					return 0;
				}
				check = 0;
				check2 = 0;
			}
			if(S.content[i*S.numEdge+j] == '1') {
				if(hastwo==1) {
					return 0;
				}
				check = 1;
			
				hasone = 1;
			}
			if(S.content[i*S.numEdge+j] == '2') {
				check2 = 2;
				hastwo = 1;
			}
			
			previous = sortcount[j];
		}	
		
		if(check==1 && check2==2) {
			return 0;
		}
			
	}
			
	return 1;
}

tree Find_leaves(tree T1, Graph S) {
	int i,j,last;
	char *value;
	
	for(i=0;i<S.numPath;i++) {
		last = -1;
		for(j=0;j<S.numEdge;j++) {
			if(S.content[i*S.numEdge+j] == '1')
				last = j;
		}
		if(last!=-1) {
			value = search_string(S.Edge,last);
			
			T1.node = appendData(T1.node,value);
			T1.node = appendData(T1.node,"leaves");
		} else {
			T1.node = appendData(T1.node,"root");
			T1.node = appendData(T1.node,"leaves");
			
		}
		value = generate_node("p",i/2+1);
			
		if(i%2==0)
			value = stringcat(value,'a');
		else
			value = stringcat(value,'b');
				
		T1.edge = appendData(T1.edge,value);
	}
	return T1;
}

int getnumber(char *value,int len) {
	char str[10];
	int m=0,j,row;
	len = strlen(value);
	for(j=1;j<len;j++) {
		str[m] = value[j];
		m++;
	}
	str[m] = '\0';
				
	row = atoi(str);
	return row-1;
}

stack traverse_tree(FILE *fp, tree T, stack s, char *root, char *output,int width,Data *label) {
	int i,row,column,clen,l,j,pos;
	int len = DataLength(T.edge);
	Data *hnode = T.node;
	Data *hedge = T.edge;
	int stringlen;
	char aorb;
	int found = 0;
	int found2 = 0;
	
	for(i=0;i<len;i++) {
		if(strcmp(hnode->value,root)==0) {
			if(strcmp(hnode->next->value,"leaves")==0) {
				if(found == 1)
					fprintf(fp," , ");
					
				found = 1;	
				fprintf(fp," e %s ",hedge->value);
					
				stringlen = strlen(hedge->value);
				aorb = hedge->value[stringlen-1];
				
				row = getnumber(hedge->value,stringlen-1); // get row number of path
				
				Data *hcontent = s.content;
				clen = DataLength(s.content);
				
				for(l=0;l<clen;l++) {
					stringlen = strlen(hcontent->value);
					column = getnumber(hcontent->value,stringlen);
					
					if(aorb == 'a')
						output[2*row*width+column] = '1';
					else
						output[(2*row+1)*width+column] = '1';
					hcontent = hcontent->next;
				}
			} else {			
				found = 1;
				
				s = push(s,hedge->value);
				
				if(found2 == 1)
					fprintf(fp," , ");
					
				fprintf(fp,"%s ",hedge->value);
				
				Data *hlabel = label;
				pos = search_pos(label,hedge->value);
				for(j=0;j<pos+1;j++)
					hlabel = hlabel->next;
					
				while(strcmp(hlabel->value,"|")!=0 && strcmp(hlabel->value,"END")!=0) {
					fprintf(fp," %s",hlabel->value);
					hlabel = hlabel->next;
				}
				
				fprintf(fp," ( ");
				s = traverse_tree(fp,T,s,hnode->next->value,output,width,label);
				
				fprintf(fp," ) ");
				s = pop(s);
				found2 = 1;
			}
			
		}
		
		hedge = hedge->next;
		hnode = hnode->next->next;
	}
	return s;
}

// Generate the haplotypes from T and reports T in the Newick format
char *create_map(char *filename, tree T, Data *Label, int numEdge, int numPath) {
	stack s;
	int i,j,len,index,stringlen,row;
	char *output = new char[2*numEdge*numPath+1];
	FILE *fp = fopen(filename,"w");
	
	for(i=0;i<2*numEdge*numPath;i++)
		output[i] = '0';
	output[i] = '\0';
	
	s.content = newData();
	
	fprintf(fp,"( ");
	traverse_tree(fp,T,s,"root",output,numEdge,Label);
	
	// handle same labels
	len = DataLength(Label);
	Data *hlabel = Label;
	
	index = -1;
	for(i=0;i<len;i++) {
		if(strcmp(hlabel->value,"|")==0) {
			index = -1;
		} else {
			stringlen = strlen(hlabel->value);
			row = getnumber(hlabel->value,stringlen);
			if(index==-1)
				index = row;
			else {
				//printf("%d %d\n",row,index);
				for(j=0;j<2*numPath;j++)
					output[j*numEdge+row] = output[j*numEdge+index];
			}
		}
		hlabel = hlabel->next;			
	}
	
	fprintf(fp," );\n");
	fclose(fp);
	
	return output;
}

// Build a tree from the haplotypes
int build_tree(void)
{
	int i,j,len;
	Graph S,G;
	
	char *output;
	char *filename = "temp.1co";
	
	// read input haplotypes
	S = init(filename);
	
	len = S.numEdge;
	
	int *count = new int[S.numEdge];

	// calculate the leaf-counts
	count = caculate_leaf_count(S);
	
	// sort the graph by the leaf-counts
	G = sort_graph(S,count); 
			
	LabelSet = newData();

	// find duplicate columns
	G = handle_duplicate(G);
	
	Data *R1;
	tree T1;
	T1.edge = newData();
	T1.node = newData();

	ResultA = newDataA();
	for(i=0;i<G.numPath;i++) {
		ResultA = appendDataA(ResultA,"HEAD");
	}

	// create paths
	R1 = create_R1(G);
	
	// create a tree according to the paths
	create_T1(R1,G,2);
		
	T1 = Merge_Path(T1,G.numPath);
	T1 = Find_leaves(T1,G);
	
	output = create_map("temp.1ct",T1,LabelSet,G.numEdge,G.numPath);
			
	for(i=0;i<G.numPath;i++)
		for(j=0;j<G.numEdge;j++) {
			if(output[i*G.numEdge+j]!=S.content[i*S.numEdge+j]) {
				return 0;
			}
		}
	
	printf("A phylogeny tree has been found and matches the output.\n");
	
	//delete_graph(S);
	//delete_graph(G);
	//delete_data(PathSet);
	//delete_data(extra_edges);
	//delete_dataA(ResultA);
	//delete_data(LabelSet);
	return 1;
}
