#!/usr/bin/env python
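#library_to_fraction_of_base_reads_that_come_from_global_gene_dict.py
# NOTE(review): the usage message below calls this script
# get_average_coverage_by_gene_and_library_from_parsed_pileup.py -- confirm which name is current.
# Prints the mean coverage of each gene on each library, and over all libraries, from a parsed pileup file.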

import sys, helper_functions

# The script needs exactly two arguments after its own name: the parsed
# pileup file and the gene list (with homologs) file.
if len(sys.argv) != 3:
	raise IOError('Usage: ./get_average_coverage_by_gene_and_library_from_parsed_pileup.py T<number>_parsed_pileup.txt gene_list_with_homologs_file')

# =========== These are for Tilling3: ==============
#global_gene_list = ['CsFAD2A','CsFAD2B','CsFAD2C','HLP1','KRP1607','KRP36','Os4g24710','Os5g47850','OsMIPS','OsPWD','OsRDR2','PIP','PITPK','SEX4','SUC4','GWD']
#global_library_list = raise IOError, 'This has not been initialized yet.'
# ========================================================================


# =========== These are ONLY for the WHEAT GENES from Tilling2: ==============
#global_gene_list = ['Tad_KRP-B4', 'Tu_WKRP5_A', 'As_WKRP5_B', 'Ta_FT_A', 'Ta_FT_B', 'Ta_FT_D', 'Tx_SYD', 'Td_Sbe-IIa_A', 'Td_Sbe-IIa_B', 'Ta_Sbe-IIa_A', 'Td_Ipk-1_A', 'Td_Ipk-1_B', 'Ta_Yr36-Kinase1_B', 'Ta_Mot1_A', 'Td_FDL2']
#global_library_list = ["T2R1", "T2R2", "T2R3", "T2R4", "T2R5", "T2R6", "T2R7", "T2R8","T2C1", "T2C2", "T2C3", "T2C6", "T2C7", "T2C8", "T2C9", "T2C10", "T2C11", "T2C12"]
# ========================================================================


# Names of the tab-separated fields of one parsed pileup line, in column order.
# get_dictionary_for_parsed_pileup_line() uses them as dictionary keys.
global_header_field_list = [
	# reference sequence, position, library and per-base counts
	"refseq", "position", "library", "refbase",
	"A", "a", "T", "t", "C", "c", "G", "g",
	"comma", "dot", "coverage",
	"FrAa", "FrTt", "FrCc", "FrGg",
	"SkewA", "SkewT", "SkewC", "SkewG",
	"MQ(Aa)", "MQ(Tt)", "MQ(Cc)", "MQ(Gg)",
	"MQ(CommaDot)", "MQ(all)",
	"deltaQ-A", "deltaQ-T", "deltaQ-C", "deltaQ-G",
	"Aa_HQ_and_LQ", "Tt_HQ_and_LQ", "Cc_HQ_and_LQ", "Gg_HQ_and_LQ", "ref_HQ_and_LQ",
	"FrHQ_for_Aa", "FrHQ_for_Tt", "FrHQ_for_Cc", "FrHQ_for_Gg", "FrHQ_for_ref",
	"quality_cutoff",
]

# Number of positions main() leaves out at each end of a gene when it
# averages coverage.
global_estimated_primer_length = 40


def main(parsed_pileup_filename, gene_list_with_homologs_filename):
	"""Print a tab-separated table of mean coverage per gene and per library.

	For every gene returned by
	helper_functions.get_gene_list_from_gene_list_with_homologs_file, the
	parsed pileup file is read with helper_functions.read_parsed_pileup_file
	and the "coverage" values are averaged per library and over all
	libraries. The first and last global_estimated_primer_length positions
	of each gene are left out of the averages.

	Arguments:
	parsed_pileup_filename -- path of the parsed pileup file; its first
		line is skipped (lines_to_skip = 1).
	gene_list_with_homologs_filename -- path of the gene list file.

	Raises IOError if a library occurs twice at the position used to build
	the library list, or if a later line names a library missing from it.
	"""
	gene_list = helper_functions.get_gene_list_from_gene_list_with_homologs_file(gene_list_with_homologs_filename)

	# Filled in from the first gene that is actually present in the pileup.
	library_list = []
	header_printed = False

	lines_to_skip = 1
	print('Mean coverage for each gene on each library and over all libraries:\n')
	for gene in gene_list:
		line_dictionary_list_for_each_position = helper_functions.read_parsed_pileup_file(parsed_pileup_filename, lines_to_skip, gene, global_header_field_list)

		# check for genes not in parsed pileup file
		if not line_dictionary_list_for_each_position:
			print('Warning: Did not find gene %s in the parsed pileup file' % gene)
			continue

		if not header_printed:
			# Build the library list from the first gene that was found.
			# Keying this on the first loop index would skip it whenever
			# the first listed gene is absent from the pileup file.
			if 1 in line_dictionary_list_for_each_position:
				reference_position = 1
			else:
				reference_position = min(line_dictionary_list_for_each_position)
			for line_dictionary in line_dictionary_list_for_each_position[reference_position]:
				# Strip trailing spaces exactly as the per-position loop
				# below does, so both sides compare the same names.
				library = line_dictionary["library"].rstrip(" ")
				if library in library_list:
					raise IOError('Error: unexpectedly saw the same library (%s) twice in the same line dictionary list' % library)
				library_list.append(library)

			print('\t'.join(['Gene'] + library_list + ['All']))
			header_printed = True

		# NOTE(review): the number of distinct positions is used as the gene
		# length, which presumably assumes positions are numbered 1..N
		# without gaps -- confirm against the pileup parser.
		num_positions_in_gene = len(line_dictionary_list_for_each_position)
		library_to_coverage_list_dict = {}
		coverage_list = []
		start_position = 1 + global_estimated_primer_length
		stop_position = num_positions_in_gene - global_estimated_primer_length
		for position in range(start_position, stop_position + 1):
			# Look the position up in the per-position dictionary itself;
			# a missing position is reported and skipped.
			if position not in line_dictionary_list_for_each_position:
				print('WARNING: Could not find mapping to TILLING sequence %s for position %d' % (gene, position))
				continue
			for line_dictionary in line_dictionary_list_for_each_position[position]:
				coverage = int(line_dictionary["coverage"])
				library = line_dictionary["library"].rstrip(" ")
				if library not in library_list:
					raise IOError('Found library (%s) that is not in predetermined library list' % library)
				library_to_coverage_list_dict.setdefault(library, []).append(coverage)
				coverage_list.append(coverage)

		outline = gene
		for library in library_list:
			outline += '\t' + _format_mean_coverage(library_to_coverage_list_dict.get(library, []))
		outline += '\t' + _format_mean_coverage(coverage_list)
		print(outline)


def _format_mean_coverage(coverage_values):
	"""Return the mean of coverage_values formatted with '%.0f', or 'NA' if the list is empty."""
	if not coverage_values:
		return 'NA'
	return '%.0f' % (float(sum(coverage_values)) / float(len(coverage_values)))

def get_gene_list_from_gene_list_with_homologs_file(gene_list_with_homologs_filename):
	"""Return the gene names listed in a gene-list-with-homologs file.

	Blank lines and lines starting with 'Organism:' are ignored. Every other
	line is either a whitespace-separated list of names or the same list
	wrapped in parentheses: (homolog1 homolog2 . . . homologN). Names are
	returned in file order.

	Raises IOError if a line opens with '(' but does not end with ')', or
	if the same name appears more than once in the file.
	"""
	gene_list = []
	# Mirror of gene_list used only for fast duplicate detection.
	genes_seen = set()

	# The with-block closes the file even when one of the checks raises.
	with open(gene_list_with_homologs_filename, 'r') as gene_list_with_homologs_file:
		for line in gene_list_with_homologs_file:
			line = line.strip()
			if (not line) or line.startswith('Organism:'):
				continue
			if line[0] == '(':
				if line[-1] != ')':
					raise IOError('Expected (homolog1 homolog2 . . . homologN)')
				line = line[1:-1]
			for homolog in line.split():
				if homolog in genes_seen:
					raise IOError('Unexpectedly found same gene (%s) appear more than once in gene_list_with_homologs_file' % homolog)
				genes_seen.add(homolog)
				gene_list.append(homolog)
	return gene_list

def read_parsed_pileup_file(parsed_pileup_filename, lines_to_skip, gene_name):
	"""Return the parsed pileup lines of one gene, grouped by position.

	Arguments:
	parsed_pileup_filename -- path of the parsed pileup file.
	lines_to_skip -- number of leading lines to discard (converted with int()).
	gene_name -- only lines whose "refseq" field equals this value are kept.

	Each remaining line is parsed with
	helper_functions.get_dictionary_for_parsed_pileup_line and the matching
	dictionaries are grouped with
	helper_functions.divide_line_dictionary_list_by_position, whose result
	is returned unchanged.

	Note: main() in this file calls the four-argument
	helper_functions.read_parsed_pileup_file rather than this function.
	"""
	line_dictionary_list = []
	# The with-block closes the file even if parsing a line raises.
	with open(parsed_pileup_filename, 'r') as parsed_pileup_file:
		for _ in range(int(lines_to_skip)):
			parsed_pileup_file.readline()
		for line in parsed_pileup_file:
			line_dictionary = helper_functions.get_dictionary_for_parsed_pileup_line(line)
			if line_dictionary["refseq"] == gene_name:
				line_dictionary_list.append(line_dictionary)
	return helper_functions.divide_line_dictionary_list_by_position(line_dictionary_list)

def get_dictionary_for_parsed_pileup_line(line):
	"""Turn one tab-separated parsed pileup line into a field dictionary.

	Trailing newline and carriage-return characters are removed, the line is
	split on tabs, and each value is stored under the matching name from
	global_header_field_list. Trailing spaces are removed from the "library"
	value.

	Raises IOError if the number of fields differs from the number of
	header names.
	"""
	field_values = line.rstrip('\n\r').split('\t')
	if len(field_values) != len(global_header_field_list):
		raise IOError('Each line must have one field for each field in the header')
	line_dictionary = dict(zip(global_header_field_list, field_values))
	line_dictionary["library"] = line_dictionary["library"].rstrip(" ")
	return line_dictionary

def divide_line_dictionary_list_by_position(line_dictionary_list):
	"""Group line dictionaries by their "position" value.

	Returns a dictionary that maps int(line_dictionary["position"]) to the
	list of line dictionaries at that position, in their input order.
	"""
	grouped_by_position = {}
	for entry in line_dictionary_list:
		grouped_by_position.setdefault(int(entry["position"]), []).append(entry)
	return grouped_by_position


# Run the report only when this file is executed as a script, so that
# importing it does not start the whole pipeline.
if __name__ == '__main__':
	main(sys.argv[1], sys.argv[2])
