#!/usr/bin/env python
import sys

# Usage check: the script requires exactly five command-line arguments
# (sys.argv[0] is the script name, hence the comparison against 6).
if len(sys.argv) != 6:
	# Call-style raise: valid in both Python 2 and Python 3, unlike the
	# Python-2-only "raise IOError, msg" form.
	raise IOError('./get_orthologs_with_low_genome_mapping_bias.py ortholog_file refP_to_ref_mapping_file otherP_to_ref_mapping_file mapping_bias_ratio_threshold number_of_reads_threshold')

def main(ortholog_filename, refP_to_ref_mapping_file, otherP_to_ref_mapping_file, mapping_bias_ratio_threshold_string, number_of_reads_threshold_string):
	"""Print a genome-mapping-bias result for every ref locus in the ortholog file.

	For each ortholog triple (ref, refP, otherP) the mapping bias ratio is
	min(reads) / max(reads), where the two read counts are the numbers of
	reads recorded for the refP and otherP orthologs against their
	best-matching ref locus.  A triple is reported as 'NA' when
	  * the refP or the otherP ortholog does not map best to the
	    orthologous ref locus, or
	  * both read counts are zero (the ratio is undefined), or
	  * neither read count reaches number_of_reads_threshold.

	Arguments (all strings, as received from the command line):
	  ortholog_filename -- tab-delimited file of ref, refP, otherP triples
	  refP_to_ref_mapping_file -- mapping file for the refP orthologs
	  otherP_to_ref_mapping_file -- mapping file for the otherP orthologs
	  mapping_bias_ratio_threshold_string -- 'NA' to print the ratio itself;
	      otherwise a float, and 'yes' is printed when ratio >= threshold
	      ('NA' when it is below)
	  number_of_reads_threshold_string -- integer minimum read count

	Output is a two-column, tab-delimited table written to stdout.

	Raises ValueError if a threshold cannot be parsed, and KeyError if an
	ortholog named in the ortholog file is missing from its mapping file.
	"""
	number_of_reads_threshold = int(number_of_reads_threshold_string)

	## Parse the ratio threshold once, up front: it does not change per locus,
	## and a malformed value is reported before any input file is read.
	report_ratio = (mapping_bias_ratio_threshold_string == 'NA')
	mapping_bias_ratio_threshold = None
	if not report_ratio:
		mapping_bias_ratio_threshold = float(mapping_bias_ratio_threshold_string)

	## for each locus in ref, get corresponding orthologs in refP and otherP
	ref_to_refP_ortholog_dict, ref_to_otherP_ortholog_dict = get_ref_to_refP_and_ref_to_otherP_ortholog_dicts(ortholog_filename)

	## for each refP / otherP ortholog, get its best-matching locus in ref
	## and the number of reads recorded for that best locus
	refP_to_best_ref, refP_to_num_reads_for_best_ref = get_x_to_y_best_locus_and_num_reads_for_best_locus(refP_to_ref_mapping_file)
	otherP_to_best_ref, otherP_to_num_reads_for_best_ref = get_x_to_y_best_locus_and_num_reads_for_best_locus(otherP_to_ref_mapping_file)

	ref_to_result_dict = {}
	for ref in ref_to_refP_ortholog_dict:
		refP = ref_to_refP_ortholog_dict[ref]
		otherP = ref_to_otherP_ortholog_dict[ref]

		## default result; overwritten only if the triple passes every filter
		ref_to_result_dict[ref] = 'NA'

		## both orthologs must map best to the orthologous locus in ref
		if refP_to_best_ref[refP] != ref:
			continue
		if otherP_to_best_ref[otherP] != ref:
			continue

		num_reads_1 = refP_to_num_reads_for_best_ref[refP]
		num_reads_2 = otherP_to_num_reads_for_best_ref[otherP]
		num_reads_min = min(num_reads_1, num_reads_2)
		num_reads_max = max(num_reads_1, num_reads_2)
		if num_reads_max == 0:
			## ratio would be 0/0, so low genome mapping bias cannot be called
			continue
		## at least one of the two orthologs must reach the read-count threshold
		if num_reads_max < number_of_reads_threshold:
			continue

		## ratio lies in [0, 1]; values near 1 mean similar read counts,
		## i.e. low genome mapping bias
		mapping_bias_ratio = float(num_reads_min) / float(num_reads_max)
		if report_ratio:
			## report the observed genome mapping bias ratio itself
			ref_to_result_dict[ref] = '%f' % mapping_bias_ratio
		elif mapping_bias_ratio >= mapping_bias_ratio_threshold:
			## ratio at or above the threshold: bias is low enough
			ref_to_result_dict[ref] = 'yes'

	## Write one line per ref locus.  Printing happens only after all loci
	## have been processed, so an error above produces no partial table.
	## Single-argument print(...) behaves the same in Python 2 and Python 3.
	print('locus\tgenome_mapping_bias_ratio')
	for ref in ref_to_refP_ortholog_dict:
		## every ref received an entry (at least 'NA') in the loop above
		print('%s\t%s' % (ref, ref_to_result_dict[ref]))
	return

def get_ref_to_refP_and_ref_to_otherP_ortholog_dicts(ortholog_filename):
	"""Read the ortholog file and return (ref -> refP, ref -> otherP) dicts.

	Each line of the file must hold exactly three tab-delimited fields:
	ref, refP and otherP.  Trailing newline / carriage-return characters
	are stripped before splitting.

	Raises IOError if a line does not have three fields, or if any ref,
	refP or otherP identifier appears on more than one line.
	"""
	ref_to_refP_ortholog_dict = {}
	ref_to_otherP_ortholog_dict = {}

	## identifiers already seen, used only for duplicate detection
	seen_refP = set()
	seen_otherP = set()

	## 'with' closes the file even when one of the checks below raises
	with open(ortholog_filename, 'r') as ortholog_file:
		for line in ortholog_file:
			field_list = line.rstrip('\n\r').split('\t')
			if len(field_list) != 3:
				raise IOError("Internal Error: Expected there to be three tab-delimited fields per line")
			ref, refP, otherP = field_list

			if ref in ref_to_refP_ortholog_dict:
				raise IOError('unexpected: saw ref twice in ortholog file')
			if refP in seen_refP:
				raise IOError('unexpected: saw refP twice in ortholog file')
			if otherP in seen_otherP:
				raise IOError('unexpected: saw otherP twice in ortholog file')
			seen_refP.add(refP)
			seen_otherP.add(otherP)

			ref_to_refP_ortholog_dict[ref] = refP
			ref_to_otherP_ortholog_dict[ref] = otherP

	return ref_to_refP_ortholog_dict, ref_to_otherP_ortholog_dict

def get_x_to_y_best_locus_and_num_reads_for_best_locus(x_to_y_mapping_filename):
	"""Read a mapping file and return (x -> best y, x -> read count) dicts.

	The file is tab-delimited.  For every non-header line, the first field
	is the x identifier, the second field is stored as its best y locus and
	the fourth field is parsed as an integer number of reads for that best
	locus.  Any line whose first field is 'query.sequence.name' is treated
	as a header and skipped.

	Raises IOError if an x identifier appears on more than one line.
	Lines with fewer than four fields raise IndexError, and a non-integer
	fourth field raises ValueError.
	"""
	x_to_best_y = {}
	x_to_num_reads_for_best_y = {}

	## 'with' closes the file even when a line below raises
	with open(x_to_y_mapping_filename, 'r') as x_to_y_mapping_file:
		for line in x_to_y_mapping_file:
			field_list = line.rstrip('\n\r').split('\t')
			x = field_list[0]
			if x == 'query.sequence.name':
				## header line
				continue
			## x_to_best_y holds every x stored so far, so it doubles as
			## the duplicate check
			if x in x_to_best_y:
				raise IOError('unexpected: saw x twice')
			x_to_best_y[x] = field_list[1]
			x_to_num_reads_for_best_y[x] = int(field_list[3])
	return x_to_best_y, x_to_num_reads_for_best_y

# Script entry point: the usage check at the top of the file guarantees that
# exactly five arguments follow the script name, so unpack them into main().
main(*sys.argv[1:])
