#!/usr/bin/env python

## Note: takes in reads mapped to genomic sequence

## To update:
# 1. fix simplified code for overlap between gene models
# 2. update Eland read filtering (are we considering all-uniquely mapping reads and are we excluding all multiply-mapping reads?)

import sys
from process_gene_models_helper_functions import *
from read_in_SNP_data__module import *
from read_alignment_statistics_per_locus__module import *

# Validate the command line before doing any work: the script name plus
# exactly five positional arguments are required.
# The call form of `raise` is used because it parses on both Python 2 and
# Python 3; the exception type and message are unchanged.
if len(sys.argv) != 6:
	raise IOError('Usage: ./statistics_per_locus.py SAM_filename_1[,SAM_filename_2,...] sequence_feature_annotations_filename SNP_filename output_file pipeline_log_file')

## WARNING: Currently, analysis_option is automatically set to 'full'
# main() forwards this value unchanged to the read-statistics and printing helpers.
analysis_option = 'full'

def main(SAM_filename_list_string, sequence_feature_annotations_filename, SNP_filename, output_filename, pipeline_log_filename):
	"""Run the per-locus read statistics pipeline end to end.

	Loads the exon/gene-model annotations and the SNP data, hands them
	together with the SAM files to the read-statistics helper, and passes
	the resulting per-locus dictionaries to the printing helper.

	Arguments:
	SAM_filename_list_string -- one or more SAM filenames joined by commas
	    (split on ',' with no whitespace stripping).
	sequence_feature_annotations_filename -- passed to get_exon_data().
	SNP_filename -- passed to get_SNP_data().
	output_filename -- passed to print_read_statistics_per_locus()
	    (presumably the file the results are written to; confirm in that helper).
	pipeline_log_filename -- passed to the read-statistics helper
	    (presumably a log file it appends to; confirm in that helper).

	Reads the module-level `analysis_option` and forwards it to both helpers.
	Returns None.
	"""

	## read in exons
	locus_to_models_dict, model_to_exons_dict, model_to_trimmed_exons_dict, locus_to_exons_of_selected_model_dict, \
			locus_to_combined_length_of_non_overlapping_regions_of_selected_exon_model_dict, \
			general_location_to_loci_dict \
			= get_exon_data(sequence_feature_annotations_filename)
	# print(<single expression>) emits the same text under Python 2's print
	# statement and Python 3's print function.
	print('DEBUG: Got exon data - %d loci, %d models' %
			(len(locus_to_models_dict), len(model_to_exons_dict)))

	## read in SNPs
	SNP_dict, SNP_to_refbase_dict, SNP_to_base_change_dict = get_SNP_data(SNP_filename)
	print('DEBUG: Read in SNPs')

	## compute read statistics over all SAM files given on the command line
	SAM_filename_list = SAM_filename_list_string.split(',')
	locus_to_RPKM_dict, locus_to_num_reads_dict, locus_to_num_reads_per_allele_dict = \
		get_read_statistics_per_locus_from_multiple_SAM_files(SAM_filename_list, locus_to_models_dict, model_to_trimmed_exons_dict, locus_to_exons_of_selected_model_dict, \
								      locus_to_combined_length_of_non_overlapping_regions_of_selected_exon_model_dict, \
								      general_location_to_loci_dict, SNP_dict, SNP_to_refbase_dict, SNP_to_base_change_dict, \
								      analysis_option, pipeline_log_filename)

	## report the per-locus results
	print_read_statistics_per_locus(locus_to_RPKM_dict, locus_to_num_reads_dict, locus_to_num_reads_per_allele_dict,
					locus_to_exons_of_selected_model_dict, analysis_option, output_filename)

# Script entry: forward the five positional command-line arguments
# (SAM file list, annotations, SNPs, output file, pipeline log) to main().
main(*sys.argv[1:6])
