// preprocess.c
// Evan Lord
// Created: July 10, 2007
// Last Modified: July 23, 2007


#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <mcheck.h>
#include "constants.h"
#include "struct_def.h"


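// This function normalizes the data so that each dimension is the edge
// of a unit hypercube.  The function receives the DATA struct containing
// the original data and overwrites that data in place with the normalized
// values.  Nothing is returned; the minimum and maximum found for each
// dimension are stored in the struct's mins and maxes arrays.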

// Parameters:
//   data_to_normalize - data set to rescale.  int_dimensions,
//                       int_num_data_points and data_array are read;
//                       data_array is rewritten in place; mins and maxes are
//                       allocated here (one float per dimension) and filled
//                       with the column minimum and maximum found BEFORE
//                       rescaling.
//
// Each value v of a column becomes (v - min) / (max - min).  A column whose
// values are all equal has no range to scale by, so every value in it is set
// to 0 instead of dividing by zero.  With no data points, mins and maxes are
// filled with 0 and the data array is left untouched.
//
// On allocation failure a message is printed to stderr and the program exits
// with status 1.
void Normalize(struct DATA * data_to_normalize)
{
  float min;          // minimum value for current dimension
  float max;          // maximum value for current dimension
  float range;        // max - min for current dimension
  float temp_data;

  int int_dims = data_to_normalize->int_dimensions;
  int int_points = data_to_normalize->int_num_data_points;

  // malloc(0) may legitimately return NULL, which would be misreported as
  // an out-of-memory condition, so always request at least one element
  size_t alloc_len = (int_dims > 0) ? (size_t)int_dims : 1;

  // loop counters
  int int_col;
  int int_row;

  // allocate memory to hold minimum values for each dimension
  if((data_to_normalize->mins = (float *)malloc(alloc_len * sizeof(float))) == NULL){
    fprintf(stderr, "\nMemory allocation error.  Closing program.\n\n");
    exit(1);
  }

  // allocate memory to hold maximum values for each dimension
  if((data_to_normalize->maxes = (float *)malloc(alloc_len * sizeof(float))) == NULL){
    fprintf(stderr, "\nMemory allocation error.  Closing program.\n\n");
    exit(1);
  }

  // loop over all dimensions
  for(int_col = 0; int_col < int_dims; int_col++){

    // nothing to scan: record a neutral range rather than reading row 0
    if(int_points <= 0){
      data_to_normalize->mins[int_col] = 0.0f;
      data_to_normalize->maxes[int_col] = 0.0f;
      continue;
    }

    // initialize both min and max to first value in the column
    min = data_to_normalize->data_array[0][int_col];
    max = min;

    // scan the rest of the column for its extremes
    for(int_row = 1; int_row < int_points; int_row++){
      temp_data = data_to_normalize->data_array[int_row][int_col];
      if(temp_data < min){
        min = temp_data;
      }
      if(temp_data > max){
        max = temp_data;
      }
    }

    // store min and max for current dimension
    data_to_normalize->mins[int_col] = min;
    data_to_normalize->maxes[int_col] = max;

    range = max - min;

    // loop through entire column again and normalize using min and max values
    for(int_row = 0; int_row < int_points; int_row++){
      if(range > 0.0f){
        temp_data = data_to_normalize->data_array[int_row][int_col];
        temp_data = (temp_data - min) / range;
      }
      else{
        // constant column: avoid 0/0 (NaN) and map every value to 0
        temp_data = 0.0f;
      }
      data_to_normalize->data_array[int_row][int_col] = temp_data;
    }
  }
}


// This function reads the constraints from the constraints file.  It receives the name of the
// constraints file and a DATA struct.  Nothing is returned; the ML and CL constraint arrays, the
// number of constraints of each kind, and the per-point constraint lists are stored in the struct.

void ReadConstraints(char * str_constraints_file, struct DATA * data)
{ 
  FILE * pConstraints;   // pointer to constraints file

  int ** ML_constraints;    // array of constraints
  int ** CL_constraints;
  int ** temp;           // temporary array used in dynamic allocation

  int int_ML_rows_alloc;    // number of rows allocated in array
  int int_num_ML_rows;      // number of rows filled
  int int_CL_rows_alloc;
  int int_num_CL_rows;

  // loop counters
  int int_loop_count;
  int int_outer_loop;
  int int_inner_loop;

  int temp_storage[CONSTRAINT_COLS];

  int int_data_point_index;
  int int_other_point_index;
  struct CONSTRAINT * new_constraint_node;

  // allocate memory for the constraint involvement arrays
  if((data->ML_constraint_involvement = (struct CONSTRAINT ***)malloc(data->int_num_data_points * sizeof(struct CONSTRAINT **))) == NULL){
    fprintf(stderr, "\nMemory allocation error.  Closing program.\n\n");
    exit(1);
  }
  else{
    for(int_loop_count = 0; int_loop_count < data->int_num_data_points; int_loop_count++){
      if((data->ML_constraint_involvement[int_loop_count] = (struct CONSTRAINT **)malloc(2 * sizeof(struct CONSTRAINT *))) == NULL){
	fprintf(stderr, "\nMemory allocation error.  Closing program.\n\n");
	exit(1);
      }
      else{
	
	// initialize all pointers to NULL 
	data->ML_constraint_involvement[int_loop_count][HEAD] = NULL;
	data->ML_constraint_involvement[int_loop_count][TAIL] = NULL;
	
      }
    }
  }
  if((data->CL_constraint_involvement = (struct CONSTRAINT ***)malloc(data->int_num_data_points * sizeof(struct CONSTRAINT **))) == NULL){
    fprintf(stderr, "\nMemory allocation error.  Closing program.\n\n");
    exit(1);
  }
  else{
    for(int_loop_count = 0; int_loop_count < data->int_num_data_points; int_loop_count++){
      if((data->CL_constraint_involvement[int_loop_count] = (struct CONSTRAINT **)malloc(2 * sizeof(struct CONSTRAINT *))) == NULL){
	fprintf(stderr, "\nMemory allocation error.  Closing program.\n\n");
	exit(1);
      }
      else{
	
	// initialize all pointers to NULL 
	data->CL_constraint_involvement[int_loop_count][HEAD] = NULL;
	data->CL_constraint_involvement[int_loop_count][TAIL] = NULL;
	
      }
    }
  }
  



  // open constraints file for reading
  if((pConstraints = fopen(str_constraints_file, "r")) == NULL){
    fprintf(stderr, "\nThe file \"%s\" could not be opened.  ", str_constraints_file);
    fprintf(stderr, "The program will now terminate.\n\n");
    exit(1);
  }
  
  // allocate memory for 2D array of ML constraints
  if((ML_constraints = (int **)malloc(INCREMENT * sizeof(int *))) == NULL){
    fprintf(stderr, "\nMemory allocation error.  Closing program.\n\n");
    exit(1);
  }
  else{
    for(int_loop_count = 0; int_loop_count < INCREMENT; int_loop_count++){
      if((ML_constraints[int_loop_count] = (int *)malloc((CONSTRAINT_COLS + 2) * sizeof(int))) == NULL){
	fprintf(stderr, "\nMemory allocation error.  Closing program.\n\n");
	exit(1);
      }
    }
  }
  
  int_ML_rows_alloc = INCREMENT;
  
  // allocate memory for 2D array of CL constraints
  if((CL_constraints = (int **)malloc(INCREMENT * sizeof(int *))) == NULL){
    fprintf(stderr, "\nMemory allocation error.  Closing program.\n\n");
    exit(1);
  }
  else{
    for(int_loop_count = 0; int_loop_count < INCREMENT; int_loop_count++){
      if((CL_constraints[int_loop_count] = (int *)malloc((CONSTRAINT_COLS + 2) * sizeof(int))) == NULL){
	fprintf(stderr, "\nMemory allocation error.  Closing program.\n\n");
	exit(1);
      }
    }
  }
  
  int_CL_rows_alloc = INCREMENT;
  
  int_num_ML_rows = 0;
  int_num_CL_rows = 0;
  
  // read first integer from constraints file
  fscanf(pConstraints, "%d", &temp_storage[0]);
  
  // loop until reaching the end of the constraints file
  while(!feof(pConstraints)){
    
    // if the number of ML rows allocated is not enough, reallocate memory for larger array
    if(int_num_ML_rows >= int_ML_rows_alloc){
      if((temp = (int **)realloc(ML_constraints, ((int_ML_rows_alloc + INCREMENT) * sizeof(int *)))) == NULL){
	fprintf(stderr, "\nMemory allocation error.  Closing program.\n\n");
	exit(1);
      }
      else{
	for(int_loop_count = int_ML_rows_alloc; int_loop_count < int_ML_rows_alloc + INCREMENT; int_loop_count++){
	  if((temp[int_loop_count] = (int *)malloc((CONSTRAINT_COLS + 2) * sizeof(int))) == NULL){
	    fprintf(stderr, "\nMemory allocation error.  Closing program.\n\n");
	    exit(1);
	  }
	}
	
	int_ML_rows_alloc = int_ML_rows_alloc + INCREMENT;
	ML_constraints = temp;
      }
    }
    
    // if the number of CL rows allocated is not enough, reallocate memory for larger array
    if(int_num_CL_rows >= int_CL_rows_alloc){
      if((temp = (int **)realloc(CL_constraints, ((int_CL_rows_alloc + INCREMENT) * sizeof(int *)))) == NULL){
	fprintf(stderr, "\nMemory allocation error.  Closing program.\n\n");
	exit(1);
      }
      else{
	for(int_loop_count = int_CL_rows_alloc; int_loop_count < int_CL_rows_alloc + INCREMENT; int_loop_count++){
	  if((temp[int_loop_count] = (int *)malloc((CONSTRAINT_COLS + 2) * sizeof(int))) == NULL){
	    fprintf(stderr, "\nMemory allocation error.  Closing program.\n\n");
	    exit(1);
	  }
	}
	
	int_CL_rows_alloc = int_CL_rows_alloc + INCREMENT;
	CL_constraints = temp;
      }
    }
    
    // loop over current line of file
    for(int_loop_count = 1; int_loop_count < CONSTRAINT_COLS; int_loop_count++){
      
      // read the integer in the current column of the current row and store in array
      fscanf(pConstraints, "%d", &temp_storage[int_loop_count]);
      
    }
    
    if(temp_storage[CONSTRAINT_COLS - 1] == 1){
      for(int_loop_count = 0; int_loop_count < CONSTRAINT_COLS; int_loop_count++){
	ML_constraints[int_num_ML_rows][int_loop_count] = temp_storage[int_loop_count];
      }
      fscanf(pConstraints, "%d", &temp_storage[0]);
      int_num_ML_rows++;
    }
    else{
      if(temp_storage[CONSTRAINT_COLS - 1] == -1){
	for(int_loop_count = 0; int_loop_count < CONSTRAINT_COLS; int_loop_count++){
	  CL_constraints[int_num_CL_rows][int_loop_count] = temp_storage[int_loop_count];
	}
	fscanf(pConstraints, "%d", &temp_storage[0]);
	int_num_CL_rows++;
      }
      else{
	fprintf(stderr, "\n\nError reading constraints.\n\n");
      }
    }
  }
  
  fclose(pConstraints);    // close file
  
  for(int_loop_count = 0; int_loop_count < int_num_ML_rows; int_loop_count++){
    ML_constraints[int_loop_count][2] = -1;
    ML_constraints[int_loop_count][3] = -1;
    ML_constraints[int_loop_count][4] = 0;
  }
  
  for(int_loop_count = 0; int_loop_count < int_num_CL_rows; int_loop_count++){
    CL_constraints[int_loop_count][3] = -1;
    CL_constraints[int_loop_count][4] = 0;
  }
  
  // store results in struct
  data->ML_constraints_array = ML_constraints;
  data->CL_constraints_array = CL_constraints;
  data->int_num_ML_constraints = int_num_ML_rows;
  data->int_num_CL_constraints = int_num_CL_rows;
  
  for(int_outer_loop = 0; int_outer_loop < 2; int_outer_loop++){
    for(int_inner_loop = 0; int_inner_loop < data->int_num_ML_constraints; int_inner_loop++){
      new_constraint_node = (struct CONSTRAINT *)malloc(sizeof(struct CONSTRAINT));
      new_constraint_node->int_constraint_index = int_inner_loop;
      new_constraint_node->int_constraint_col = int_outer_loop;
      int_data_point_index = data->ML_constraints_array[int_inner_loop][int_outer_loop];
      int_other_point_index = data->ML_constraints_array[int_inner_loop][(int_outer_loop + 1) % 2];
      new_constraint_node->int_other_point = int_other_point_index;
      if(data->ML_constraint_involvement[int_data_point_index][TAIL] == NULL){
	data->ML_constraint_involvement[int_data_point_index][TAIL] = new_constraint_node;
      }
      new_constraint_node->pNext = data->ML_constraint_involvement[int_data_point_index][HEAD];
      data->ML_constraint_involvement[int_data_point_index][HEAD] = new_constraint_node;
      new_constraint_node = NULL;
    }
  }
  
  for(int_outer_loop = 0; int_outer_loop < 2; int_outer_loop++){
    for(int_inner_loop = 0; int_inner_loop < data->int_num_CL_constraints; int_inner_loop++){
      new_constraint_node = (struct CONSTRAINT *)malloc(sizeof(struct CONSTRAINT));
      new_constraint_node->int_constraint_index = int_inner_loop;
      new_constraint_node->int_constraint_col = int_outer_loop;
      int_data_point_index = data->CL_constraints_array[int_inner_loop][int_outer_loop];
      int_other_point_index = data->CL_constraints_array[int_inner_loop][(int_outer_loop + 1) % 2];
      new_constraint_node->int_other_point = int_other_point_index;
      if(data->CL_constraint_involvement[int_data_point_index][TAIL] == NULL){
	data->CL_constraint_involvement[int_data_point_index][TAIL] = new_constraint_node;
      }
      new_constraint_node->pNext = data->CL_constraint_involvement[int_data_point_index][HEAD];
      data->CL_constraint_involvement[int_data_point_index][HEAD] = new_constraint_node;
      new_constraint_node = NULL;
    }
  }
}


// This function reads the data from the data file.  It receives the name of the
// data file, a string containing the information from the names file, and a DATA
// struct.  Nothing is returned; the data array, the labels, the list of distinct
// labels and the total number of data points are stored in the struct.

// Parameters:
//   str_data_file - path of the data file.  Each line has the form
//                   label,field,field,...
//   str_names     - one character per data column.  'n' marks a numeric
//                   column, parsed with "%f".  Any other character marks a
//                   column stored as 0 when the field's first character is
//                   'F' and as 1 otherwise.
//   data_struct   - receives data_array, labels_array, int_num_data_points,
//                   distinct_labels (row index of the first occurrence of
//                   each distinct label) and int_num_labels.
//
// Input handling:
//   - blank lines are skipped and a missing final newline is accepted;
//   - labels longer than MAX_LABEL_SIZE characters are truncated;
//   - fields beyond strlen(str_names) are read and discarded;
//   - missing or unparsable numeric fields are stored as 0.
//
// A file that cannot be opened, or any allocation failure, terminates the
// program with status 1.
void ReadData(char * str_data_file, char * str_names, struct DATA * data_struct)
{
  FILE * pData;                        // pointer to data file

  float ** data = NULL;                // array to hold data
  float ** temp;                       // temporary array used in dynamic allocation

  char ** labels = NULL;               // array to hold labels
  char ** temp_labels;
  char * str_label;                    // label buffer of the current row

  int * temp_distinct;

  int int_cur_char;                    // current character; int so EOF is distinguishable

  int int_label_count;
  int int_label_flag;
  int int_label_rows;

  int int_rows_alloc = 0;              // number of rows allocated in array
  int int_num_rows = 0;                // number of rows occupied in array

  // loop counters
  int int_loop_count;
  int int_outer_loop;
  int int_inner_loop;

  int int_current_col;                 // current column in array
  int int_cols = (int)strlen(str_names);    // number of columns needed in array

  // calloc(0, ...) may return NULL without being out of memory
  size_t row_len = (int_cols > 0) ? (size_t)int_cols : 1;

  // open data file for reading
  if((pData = fopen(str_data_file, "r")) == NULL){
    fprintf(stderr, "\nThe file \"%s\" could not be opened.  ", str_data_file);
    fprintf(stderr, "The program will now terminate.\n\n");
    exit(1);
  }

  int_cur_char = fgetc(pData);   // get first character from data file

  // loop until reaching the end of the data file
  for(;;){

    // keep a free row available; the first pass performs the initial
    // allocation (realloc of NULL behaves like malloc), so both arrays
    // exist even when the file is empty
    if(int_num_rows >= int_rows_alloc){
      if((temp = (float **)realloc(data, ((int_rows_alloc + INCREMENT) * sizeof(float *)))) == NULL){
        fprintf(stderr, "\nMemory allocation error.  Closing program.\n\n");
        exit(1);
      }
      data = temp;

      if((temp_labels = (char **)realloc(labels, ((int_rows_alloc + INCREMENT) * sizeof(char *)))) == NULL){
        fprintf(stderr, "\nMemory allocation error.  Closing program.\n\n");
        exit(1);
      }
      labels = temp_labels;

      for(int_loop_count = int_rows_alloc; int_loop_count < int_rows_alloc + INCREMENT; int_loop_count++){

        // zero-filled so that fields missing from a short line read as 0
        if((data[int_loop_count] = (float *)calloc(row_len, sizeof(float))) == NULL){
          fprintf(stderr, "\nMemory allocation error.  Closing program.\n\n");
          exit(1);
        }

        // MAX_LABEL_SIZE characters plus the terminating '\0'
        if((labels[int_loop_count] = (char *)malloc((MAX_LABEL_SIZE + 1) * sizeof(char))) == NULL){
          fprintf(stderr, "\nMemory allocation error.  Closing program.\n\n");
          exit(1);
        }
      }
      int_rows_alloc = int_rows_alloc + INCREMENT;
    }

    // consume the newline that ended the previous row, and any blank lines
    while(int_cur_char == '\n'){
      int_cur_char = fgetc(pData);
    }
    if(int_cur_char == EOF){
      break;
    }

    // store text in first column of data file in labels array
    str_label = labels[int_num_rows];
    int_label_count = 0;
    while((int_cur_char != ',') && (int_cur_char != '\n') && (int_cur_char != EOF)){
      if(int_label_count < MAX_LABEL_SIZE){
        str_label[int_label_count] = (char)int_cur_char;
        int_label_count++;
      }
      int_cur_char = fgetc(pData);
    }
    str_label[int_label_count] = '\0';

    // loop over current line of the file; every field is preceded by a comma
    int_current_col = 0;
    while(int_cur_char == ','){

      // if the data is numeric
      if((int_current_col < int_cols) && (str_names[int_current_col] == 'n')){

        // read the data from the current column and store it in the array
        if(fscanf(pData, "%f", &data[int_num_rows][int_current_col]) != 1){
          data[int_num_rows][int_current_col] = 0.0f;
        }

        int_cur_char = fgetc(pData);     // get the next character from the data file
      }

      // if the data is not numeric, or the line has more fields than columns
      else{
        int_cur_char = fgetc(pData);
        if(int_current_col < int_cols){
          if(int_cur_char == 'F'){
            data[int_num_rows][int_current_col] = 0;
          }
          else{
            data[int_num_rows][int_current_col] = 1;
          }
        }
      }

      // ignore all characters until reaching a comma or the end of the line
      while((int_cur_char != ',') && (int_cur_char != '\n') && (int_cur_char != EOF)){
        int_cur_char = fgetc(pData);
      }

      int_current_col++;
    }

    int_num_rows++;
  }

  fclose(pData);   // close file

  // store results in struct
  data_struct->data_array = data;
  data_struct->labels_array = labels;
  data_struct->int_num_data_points = int_num_rows;


  /********************Find number of distinct labels***************************************/


  if((data_struct->distinct_labels = (int *)malloc(LABELS_INCREMENT * sizeof(int))) == NULL){
    fprintf(stderr, "\nMemory allocation error.  Closing program.\n\n");
    exit(1);
  }

  int_label_rows = LABELS_INCREMENT;

  data_struct->int_num_labels = 0;
  for(int_outer_loop = 0; int_outer_loop < data_struct->int_num_data_points; int_outer_loop++){

    // has this label already been recorded?
    int_label_flag = 0;
    for(int_inner_loop = 0; int_inner_loop < data_struct->int_num_labels; int_inner_loop++){
      if(!strcmp(labels[int_outer_loop], labels[data_struct->distinct_labels[int_inner_loop]])){
        int_label_flag = 1;
        break;
      }
    }

    if(int_label_flag == 0){

      // grow the list before appending, and remember the new capacity so
      // that later growth is triggered at the right size
      if(data_struct->int_num_labels >= int_label_rows){
        if((temp_distinct = (int *)realloc(data_struct->distinct_labels, ((int_label_rows + LABELS_INCREMENT) * sizeof(int)))) == NULL){
          fprintf(stderr, "\nMemory allocation error.  Closing program.\n\n");
          exit(1);
        }
        data_struct->distinct_labels = temp_distinct;
        int_label_rows = int_label_rows + LABELS_INCREMENT;
      }

      data_struct->distinct_labels[data_struct->int_num_labels] = int_outer_loop;
      data_struct->int_num_labels++;
    }
  }
}


// This function reads the names file and stores its contents in a string.  The function
// receives the name of the file and a DATA struct.  Nothing is returned; the string and its
// length (the number of dimensions) are stored in the struct.

// Parameters:
//   str_names_file - path of the names file.  Only its first line is read;
//                    commas are dropped and every other character becomes
//                    one entry of the names string.
//   data           - receives str_names (heap-allocated, NUL-terminated) and
//                    int_dimensions (the number of characters stored).
//
// An empty file is reported on stderr and yields an empty string with zero
// dimensions.  A file that cannot be opened, or an allocation failure,
// terminates the program with status 1.
void ReadNames(char * str_names_file, struct DATA * data )
{
  FILE * pNames;        // pointer to names file

  char * str_names;     // string to hold the names information
  char * str_temp;      // temporary string used in dynamic allocation

  int int_cur_char;     // current character; int so EOF is distinguishable

  int int_cols_alloc;   // characters the string can hold, excluding the '\0'
  int int_num_cols = 0; // number of columns used in the names string

  // open names file for reading
  if((pNames = fopen(str_names_file, "r")) == NULL){
    fprintf(stderr, "\nThe file \"%s\" could not be opened.  ", str_names_file);
    fprintf(stderr, "The program will now terminate.\n\n");
    exit(1);
  }

  // allocate memory for the names string (one extra byte for the '\0')
  if((str_names = (char *)malloc((INCREMENT + 1) * sizeof(char))) == NULL){
    fprintf(stderr, "\nMemory allocation error.  Closing program.\n\n");
    exit(1);
  }
  int_cols_alloc = INCREMENT;

  int_cur_char = fgetc(pNames);   // get first character from names file

  if(int_cur_char == EOF){
    fprintf(stderr, "\nThe names file \"%s\" is empty.\n", str_names_file);
  }

  // loop over entire line of names file
  while((int_cur_char != '\n') && (int_cur_char != EOF)){

    if(int_cur_char != ','){

      // if the string is full, reallocate memory for a longer string.
      // Growing at ">=" keeps the byte at index int_cols_alloc free, so
      // the terminator written after the loop always fits.
      if(int_num_cols >= int_cols_alloc){
        if((str_temp = (char *)realloc(str_names, ((int_cols_alloc + INCREMENT + 1) * sizeof(char)))) == NULL){
          fprintf(stderr, "\nMemory allocation error.  Closing program.\n");
          exit(1);
        }
        str_names = str_temp;
        int_cols_alloc = int_cols_alloc + INCREMENT;
      }

      str_names[int_num_cols] = (char)int_cur_char;
      int_num_cols++;
    }

    int_cur_char = fgetc(pNames);   // get next character from names file
  }

  // terminate the string so that it can be used with strlen() and friends
  str_names[int_num_cols] = '\0';

  fclose(pNames);   // close file

  data->str_names = str_names;
  data->int_dimensions = int_num_cols;
}
