#!/usr/bin/env python

## Note: takes in reads mapped to genomic sequence

## To update:
# 1. fix simplified code for overlap between gene models
# 2. update Eland read filtering (are we considering all uniquely-mapping reads, and are we excluding all multiply-mapping reads?)

import sys
from process_gene_models_helper_functions import *
from read_in_SNP_data__module import *
from read_alignment_statistics_per_locus__module import *

## Fail fast unless exactly six command-line arguments were supplied
## (sys.argv[0] is the script name, hence the expected length of 7).
## The call form of raise is used because it is valid in both Python 2 and Python 3.
if len(sys.argv) != 7:
	raise IOError('Usage: ./statistics_per_locus_for_multiple_separate_analyses.py file_listing_sam_files_for_each_analysis sequence_feature_annotations_file SNP_file analysis_option file_listing_output_files_for_each_analysis pipeline_log_file')

def main(filename_listing_sam_files_for_each_analysis, sequence_feature_annotations_filename, SNP_filename, analysis_option, filename_listing_output_files_for_each_analysis, pipeline_log_filename):
	"""Run one per-locus read-statistics analysis for each SAM/output file pair.

	The exon annotations and the SNP data are loaded once and reused for
	every analysis.

	Arguments:
	filename_listing_sam_files_for_each_analysis -- text file listing one SAM filename per line
	sequence_feature_annotations_filename -- annotation file handed to get_exon_data()
	SNP_filename -- SNP file handed to get_SNP_data()
	analysis_option -- forwarded unchanged to the statistics and printing helpers
	filename_listing_output_files_for_each_analysis -- text file listing one output
		filename per line; the i-th output file receives the results for the i-th SAM file
	pipeline_log_filename -- forwarded to get_read_statistics_per_locus_from_multiple_SAM_files()

	Raises IOError if the two listing files do not contain the same number
	of entries, or if any listed SAM file cannot be opened for reading.
	"""

	## get the lists of input and output files
	SAM_filename_list = get_list_of_elements_from_file(filename_listing_sam_files_for_each_analysis)
	output_filename_list = get_list_of_elements_from_file(filename_listing_output_files_for_each_analysis)

	if len(SAM_filename_list) != len(output_filename_list):
		raise IOError('Expected that there will be exactly one output file listed for each SAM file')

	## read in exons
	locus_to_models_dict, model_to_exons_dict, model_to_trimmed_exons_dict, locus_to_exons_of_selected_model_dict, \
			locus_to_combined_length_of_non_overlapping_regions_of_selected_exon_model_dict, \
			general_location_to_loci_dict \
			= get_exon_data(sequence_feature_annotations_filename)
	print('DEBUG: Got exon data - %d loci, %d models' % \
			(len(locus_to_models_dict), len(model_to_exons_dict)))

	## read in SNPs
	SNP_dict, SNP_to_refbase_dict, SNP_to_base_change_dict = get_SNP_data(SNP_filename)
	print('DEBUG: Read in SNPs')

	## check up front that every SAM file can be opened, so that an unreadable
	## file is reported before any analysis has been run
	for SAM_filename in SAM_filename_list:
		print('DEBUG: SAM_filename is %s' % SAM_filename)
		open(SAM_filename, 'r').close()

	## perform a separate run for each input/output file pair
	## (the two lists were verified above to have equal length)
	for SAM_filename, output_filename in zip(SAM_filename_list, output_filename_list):
		## the helper takes a list of SAM files; each analysis supplies exactly one
		locus_to_RPKM_dict, locus_to_num_reads_dict, locus_to_num_reads_per_allele_dict = \
			get_read_statistics_per_locus_from_multiple_SAM_files(
				[SAM_filename], locus_to_models_dict, model_to_trimmed_exons_dict,
				locus_to_exons_of_selected_model_dict,
				locus_to_combined_length_of_non_overlapping_regions_of_selected_exon_model_dict,
				general_location_to_loci_dict, SNP_dict, SNP_to_refbase_dict, SNP_to_base_change_dict,
				analysis_option, pipeline_log_filename)

		print_read_statistics_per_locus(locus_to_RPKM_dict, locus_to_num_reads_dict, locus_to_num_reads_per_allele_dict,
						locus_to_exons_of_selected_model_dict, analysis_option, output_filename)
	return

def get_list_of_elements_from_file(filename):
	"""Return the non-blank lines of a text file as a list of strings.

	Each line is stripped of leading and trailing whitespace (including the
	newline). Lines that are empty after stripping are skipped, so a trailing
	blank line in a listing file does not yield an empty-string element.

	Arguments:
	filename -- path of the text file to read

	Returns a list with one string per non-blank line, in file order.
	Raises IOError if the file cannot be opened.
	"""
	## the with-block closes the file even if reading fails part-way through
	with open(filename, 'r') as listing_file:
		stripped_lines = [line.strip() for line in listing_file]
	return [element for element in stripped_lines if element]

## Script entry point: hand the six command-line arguments to main() in the order given.
main(*sys.argv[1:7])
