#!/usr/bin/env python

import sys, input_output

#############################################################
################### Homolog Alignment #######################
#############################################################

def compute_homolog_alignment_and_read_mapping_probability_distribution_for_each_gene(gene_list, clustal_multiple_alignment_filename, gene_to_tilling_sequence_dict, read_length): 
	"""Build, for every gene, its homolog alignment and per-position read mapping probabilities.

	Each gene in gene_list is treated in turn as the "specified" gene, with
	the remaining entries of gene_list acting as its homologs.

	Returns a pair of dictionaries, both keyed by gene:
	    1. the result of get_homolog_alignment_dictionary for that gene
	    2. the result of get_probability_read_at_position_maps_to_gene_dictionary
	       for that gene
	"""
	alignment_by_gene = {}
	position_probabilities_by_gene = {}

	for specified_gene in gene_list:
		# the homologs are all listed genes except (the first occurrence of) this one
		other_genes = list(gene_list)
		other_genes.remove(specified_gene)

		alignment_to_homologs = get_homolog_alignment_dictionary(
			clustal_multiple_alignment_filename, other_genes, specified_gene)
		alignment_by_gene[specified_gene] = alignment_to_homologs

		# probability that a read starting at each position maps to each gene
		probability_by_read_start = get_probability_read_maps_to_gene_dictionary(
			specified_gene, other_genes, alignment_to_homologs,
			gene_to_tilling_sequence_dict, read_length)

		# probability that a read covering each position maps to each gene
		position_probabilities_by_gene[specified_gene] = get_probability_read_at_position_maps_to_gene_dictionary(
			specified_gene, other_genes, probability_by_read_start,
			gene_to_tilling_sequence_dict, read_length)

	return alignment_by_gene, position_probabilities_by_gene

def get_homolog_alignment_dictionary(clustal_multiple_alignment_filename, homolog_list, specified_gene_name):
	"""Map positions in the specified gene to the aligned positions in each homolog.

	The clustal multiple alignment file is read as follows: three header
	lines are skipped, then the file is consumed block by block.  Each block
	holds one sequence line per gene (parsed by
	get_gene_to_partial_alignment_sequence_dictionary), followed by one line
	that is ignored and then either an empty line or the end of the file.

	Returns a dictionary of the form
	    homolog -> {position in specified gene -> position in homolog}
	Positions are 1-based counts of non-gap bases.  An entry exists only for
	alignment columns where neither the specified gene nor the homolog has a
	gap ('-').

	Raises IOError if a base other than '-', 'A', 'C', 'G' or 'T' is read, or
	if the line that should separate two blocks is not empty.
	"""
	homolog_alignment_dict = {}
	for homolog in homolog_list:
		homolog_alignment_dict[homolog] = {}

	# create list of the specified gene and all homologs
	gene_name_list = list(homolog_list)
	gene_name_list.append(specified_gene_name)

	# number of non-gap bases seen so far in each gene
	current_position_in_gene_dict = {}
	for gene_name in gene_name_list:
		current_position_in_gene_dict[gene_name] = 0

	# the with statement guarantees the file is closed, also when an error is raised
	with open(clustal_multiple_alignment_filename, 'r') as clustal_multiple_alignment_file:
		# skip header
		for _ in range(0, 3):
			clustal_multiple_alignment_file.readline()

		while True:
			gene_to_partial_alignment_sequence_dict = get_gene_to_partial_alignment_sequence_dictionary(clustal_multiple_alignment_file, homolog_list, specified_gene_name)
			specified_gene_sequence = gene_to_partial_alignment_sequence_dict[specified_gene_name]

			for i in range(0, len(specified_gene_sequence)):
				# update current position in each gene
				for gene_name in gene_name_list:
					current_base = gene_to_partial_alignment_sequence_dict[gene_name][i]
					if not (current_base in ['-', 'A', 'C', 'G', 'T']):
						raise IOError('Read bad base from clustal multiple alignment file: \'' + current_base + '\'')
					if current_base != '-':
						current_position_in_gene_dict[gene_name] += 1
				# update homolog_alignment_dict
				if specified_gene_sequence[i] != '-':
					for homolog in homolog_list:
						if gene_to_partial_alignment_sequence_dict[homolog][i] != '-':
							homolog_alignment_dict[homolog][current_position_in_gene_dict[specified_gene_name]] = current_position_in_gene_dict[homolog]

			# skip the line listing the similarity between the sequences
			clustal_multiple_alignment_file.readline()
			# this should be an empty line, or the end of the file
			line = clustal_multiple_alignment_file.readline()
			if line == "":
				# end of file
				break
			# this should be an empty line
			if line.strip() != "":
				raise IOError('Unexpected line format in multiple sequence alignment file')
	return homolog_alignment_dict

def get_gene_to_partial_alignment_sequence_dictionary(clustal_multiple_alignment_file, homolog_list, specified_gene_name):
	"""Read one block of a clustal multiple alignment and return gene -> aligned sequence.

	Exactly len(homolog_list) + 1 lines are read from the already opened
	clustal_multiple_alignment_file.  Each line must consist of two
	whitespace separated fields: a gene name and its piece of the alignment.

	Returns a dictionary mapping each gene name read to its sequence string.

	Raises IOError if
	    - a line does not have exactly two fields (this includes reaching the
	      end of the file early, since readline() then returns an empty string)
	    - a gene name is neither in homolog_list nor equal to specified_gene_name
	    - a gene name appears twice
	    - the sequences do not all have the same length
	"""
	gene_to_partial_alignment_sequence_dict = {}
	for _ in range(0, (len(homolog_list) + 1)):
		field_list = clustal_multiple_alignment_file.readline().split()
		if len(field_list) != 2:
			raise IOError('Expected exactly two fields on line')
		gene_name, base_sequence_string = field_list

		if (gene_name not in homolog_list) and (gene_name != specified_gene_name):
			raise IOError('Error: cannot recognize homolog ' + gene_name + ' from clustal multiple alignment file')
		if gene_name in gene_to_partial_alignment_sequence_dict:
			raise IOError('Error: saw homolog ' + gene_name + ' twice')

		gene_to_partial_alignment_sequence_dict[gene_name] = base_sequence_string

	# check that the base sequence lengths are all the same
	sequence_length = len(gene_to_partial_alignment_sequence_dict[specified_gene_name])
	for homolog in homolog_list:
		if len(gene_to_partial_alignment_sequence_dict[homolog]) != sequence_length:
			raise IOError('Partial alignment sequences have differing lengths')
	# TODO: Do even more error checking?
	return gene_to_partial_alignment_sequence_dict

# Note:
#     The script assumes that if a read can map to a homolog, it must do so at the position indicated by the multiple sequence alignment
def get_probability_read_maps_to_gene_dictionary(specified_gene_name, homolog_list, homolog_alignment_dict, gene_to_tilling_sequence_dict, read_length):
	"""Compute, for each read start in the specified gene, the probability the read maps to each gene.

	Read starts run from 1 up to and including the last position at which a
	read of read_length still fits in the tilling sequence of the specified
	gene.  For every read start, the specified gene and each homolog that the
	read matches (as decided by read_from_specified_gene_matches_to_homolog)
	receive an equal share of the probability; the other homologs receive 0.0.

	Returns a dictionary of the form
	    read start -> {gene -> probability}
	"""
	probability_read_maps_to_gene_dict = {}

	last_read_start = len(gene_to_tilling_sequence_dict[specified_gene_name]) - read_length + 1
	for read_start in range(1, last_read_start + 1):
		probability_by_gene = {}
		probability_read_maps_to_gene_dict[read_start] = probability_by_gene

		# mark each homolog with 1.0 (read matches) or 0.0 (read does not match)
		homolog_match_count = 0
		for homolog in homolog_list:
			read_matches = read_from_specified_gene_matches_to_homolog(
				read_start, specified_gene_name, homolog, homolog_alignment_dict,
				gene_to_tilling_sequence_dict, read_length)
			if read_matches:
				homolog_match_count += 1
			probability_by_gene[homolog] = 1.0 if read_matches else 0.0

		# the specified gene always counts as a match for its own read
		number_of_candidate_genes = float(homolog_match_count + 1)
		for homolog in homolog_list:
			probability_by_gene[homolog] /= number_of_candidate_genes
		probability_by_gene[specified_gene_name] = 1.0 / number_of_candidate_genes

	return probability_read_maps_to_gene_dict

# Check if a read starting at position "read_start" of the tilling sequence of the specified gene
# aligns with no mismatches to the tilling sequence of the selected homolog,
# in the specific multiple alignment provided by clustal
def read_from_specified_gene_matches_to_homolog(read_start, specified_gene_name, homolog, homolog_alignment_dict, gene_to_tilling_sequence_dict, read_length):
	"""Return True if the read aligns to the homolog without gaps or mismatches.

	The read covers positions read_start .. read_start + read_length - 1
	(1-based) of the specified gene.  It matches when every one of those
	positions
	    - has an entry in homolog_alignment_dict[homolog],
	    - lies at the same offset from its homolog position as the first
	      position of the read does, and
	    - carries the same base in both tilling sequences.
	A read of length zero trivially matches.
	"""
	expected_offset = None
	for index_in_read in range(read_length):
		gene_position = read_start + index_in_read

		# a position in the read that does not align to the homolog rules out a match
		if gene_position not in homolog_alignment_dict[homolog]:
			return False
		homolog_position = homolog_alignment_dict[homolog][gene_position]

		# the offset fixed by the first base must hold for the whole read
		offset = gene_position - homolog_position
		if index_in_read == 0:
			expected_offset = offset
		elif offset != expected_offset:
			return False

		# positions are 1-based, the sequence strings are 0-based
		gene_base = gene_to_tilling_sequence_dict[specified_gene_name][gene_position - 1]
		homolog_base = gene_to_tilling_sequence_dict[homolog][homolog_position - 1]
		if gene_base != homolog_base:
			return False
	return True

# Note:
#     We assume equal coverage over the starting positions of reads that cover a particular position
def get_probability_read_at_position_maps_to_gene_dictionary(specified_gene_name, homolog_list, probability_read_maps_to_gene_dict, gene_to_tilling_sequence_dict, read_length):
	"""Compute, for each position in the specified gene, the probability a covering read maps to each gene.

	For every 1-based position of the tilling sequence of the specified gene,
	the probabilities in probability_read_maps_to_gene_dict
	(read start -> {gene -> probability}) are averaged over all read starts
	whose read covers that position, weighting each read start equally.

	Returns a dictionary of the form
	    position -> {gene -> probability}
	covering the homologs and the specified gene.

	Raises IOError if an averaged probability is within 1e-12 of 1.0 without
	being exactly 1.0.

	NOTE: a read_length longer than the tilling sequence is not handled; the
	number of covering reads is then zero or negative.
	"""
	# loop invariants: sequence length, last valid read start, and the genes to report on
	sequence_length = len(gene_to_tilling_sequence_dict[specified_gene_name])
	last_read_start = sequence_length - read_length + 1

	gene_list = list(homolog_list)
	gene_list.append(specified_gene_name)

	probability_read_at_position_maps_to_gene_dict = {}
	for position in range(1, sequence_length + 1):
		probability_read_at_position_maps_to_gene_dict[position] = {}

		# read starts whose read covers this position, clipped to the reads that fit in the sequence
		read_range_start = max(1, position - read_length + 1)
		read_range_stop = min(last_read_start, position)
		read_range = range(read_range_start, read_range_stop + 1)
		number_of_reads_for_position = read_range_stop - read_range_start + 1

		for gene in gene_list:
			sum_over_reads_of_probability_read_maps_to_gene = sum(
				probability_read_maps_to_gene_dict[read_start][gene] for read_start in read_range)
			prob_read_covering_position_maps_to_gene = float(sum_over_reads_of_probability_read_maps_to_gene) / float(number_of_reads_for_position)

			# a value that is almost, but not exactly, 1.0 is treated as an error
			distance_from_one = abs(1.0 - prob_read_covering_position_maps_to_gene)
			if (distance_from_one != 0.0) and (distance_from_one < 1e-12):
				raise IOError('Error: there may be a rounding error causing prob_read_covering_position_maps_to_gene to not be set to exactly 1.0.' + \
						'It has been set to %.20f' % prob_read_covering_position_maps_to_gene)

			probability_read_at_position_maps_to_gene_dict[position][gene] = prob_read_covering_position_maps_to_gene

	## output mapping probabilities for each position
	#for gene in gene_list:
	#	sys.stdout.write('\t' + gene)
	#sys.stdout.write('\n')
	#for position in range(1, len(gene_to_tilling_sequence_dict[specified_gene_name]) + 1):
	#	sys.stdout.write('%d' % position)
	#	for gene in gene_list:
	#		sys.stdout.write('\t%.3f' % probability_read_at_position_maps_to_gene_dict[position][gene])
	#	sys.stdout.write('\n')
	#raise IOError('DEBUG: stop after printing read mapping probability distribution')
	return probability_read_at_position_maps_to_gene_dict


