#!/usr/bin/env python
#get_average_frequency_of_each_base_change_from_parsed_pileup.py

import sys, helper_functions
from base_change_frequency import *

# The script needs exactly two command-line arguments (the parsed pileup file
# and the gene-list-with-homologs file); anything else is a usage error.
# The call form of raise is used because it is valid on both Python 2 and 3.
if len(sys.argv) != 3:
	raise IOError('Usage: ./get_average_frequency_of_each_base_change_from_parsed_pileup.py T<number>_parsed_pileup.txt gene_list_with_homologs_file')


#global_gene_list = ['C7_CsFAD2A']

# Tilling3
#global_gene_list = ['CsFAD2A','CsFAD2B','CsFAD2C','HLP1','KRP1607','KRP36','Os4g24710','Os5g47850','OsMIPS','OsPWD','OsRDR2','PIP','PITPK','SEX4','SUC4','GWD']

# Tilling4
#global_gene_list = ['SUC4-1', 'OsKRP4-1_36-1111', 'OsKRP4-2_1607-3103', 'tGZ1a', 'tGZ1b', 'tSPDS1a', 'tSPDS1b', 'tSPDS1c', 'tgu5', 'tgt2a', 'tgt1a', 'tgf5a', 'tgf3a', 'tgf2b', 'tgf2c', 'Cfm4_591-1756', 'Cfm7_1897-637', 'MrOsCrs2_2143-3397', 'MR-Os-MrT-1658-2927', 'OsBlac_55735-70333', 'OsBlac_9915-23083', 'TIL_OsHP1a', 'TIL_OsHP2a', 'CENH3_522-525', 'Os02g07350-2', 'Os02g57400-1', 'Os02g57400-2', 'Os02g31960-3', 'MCM21-At5g10710_CP479-CP480', 'CENP-C-At1g15660_CP481-CP482']

# Tilling5
#global_gene_list = ['Os02g07350-3', 'Os02g31960-2', 'Os02g31960-4', 'Os02g57400-3', 'Os02g57400-4', 'Os03g04920-2', 'Os03g04920-3', 'Os03g04920-4', 'Os03g04920-5', 'Os03g39000-1', 'Os04g56580-1', 'Os04g56580-2', 'Os04g56580-3', 'Os07g09330-3', 'Os07g37220-2', 'Os07g37220-3', 'Os09g34300-1', 'Os09g34300-2', 'Os09g34300-3', 'Os09g39870-1', 'Os09g39870-2', 'Os10g42550-1', 'Os10g42550-2', 'Os01g08780-1', 'Os01g08780-3', 'Os01g59880-3', 'Os01g59880-4', 'Os02g27620-1', 'Os03g42810-1', 'Os03g42810-2', 'OsKRP1.2a', 'OsKRP2.0a', 'OsKRP5.0', 'A_PHYC', 'A_VRN3', 'B_FT2-1', 'B_FT2-2', 'VRN1_B1']

# Header field names handed to helper_functions.read_parsed_pileup_file by
# main().  Presumably these follow the column order of the parsed pileup
# file -- confirm against that helper before reordering anything here.
global_header_field_list = [
	'refseq', 'position', 'library', 'refbase',
	'A', 'a', 'T', 't', 'C', 'c', 'G', 'g',
	'comma', 'dot', 'coverage',
	'FrAa', 'FrTt', 'FrCc', 'FrGg',
	'SkewA', 'SkewT', 'SkewC', 'SkewG',
	'MQ(Aa)', 'MQ(Tt)', 'MQ(Cc)', 'MQ(Gg)', 'MQ(CommaDot)', 'MQ(all)',
	'deltaQ-A', 'deltaQ-T', 'deltaQ-C', 'deltaQ-G',
	'Aa_HQ_and_LQ', 'Tt_HQ_and_LQ', 'Cc_HQ_and_LQ', 'Gg_HQ_and_LQ', 'ref_HQ_and_LQ',
	'FrHQ_for_Aa', 'FrHQ_for_Tt', 'FrHQ_for_Cc', 'FrHQ_for_Gg', 'FrHQ_for_ref',
	'quality_cutoff',
]

def _fraction_or_nan(numerator, denominator):
	"""Return numerator / denominator as a float, or NaN if the denominator is zero."""
	denominator = float(denominator)
	if denominator == 0.0:
		return float('nan')
	return float(numerator) / denominator


def main(parsed_pileup_filename, gene_list_with_homologs_filename):
	"""Print per-refbase read totals and base-change rates for the listed genes.

	For every gene named in gene_list_with_homologs_filename, the gene's rows
	are read from parsed_pileup_filename with
	helper_functions.read_parsed_pileup_file and folded into four running
	statistics dictionaries by
	update_base_change_frequency_statistics_by_information_for_gene.  Genes
	for which the reader returns nothing are skipped.

	Two tables are then written to stdout:
	  1. for each reference base, the total number of base reads counted
	     with and without low quality reads ("HQ & LQ" vs. "HQ only");
	  2. for each reference base and each other base, the fraction of reads
	     showing that change, again "HQ & LQ" first and "HQ only" second.

	A rate whose total read count is zero is printed as NaN rather than
	raising ZeroDivisionError.

	Args:
	    parsed_pileup_filename: path of the parsed pileup file.
	    gene_list_with_homologs_filename: path of the gene list file read by
	        get_gene_list_from_gene_list_with_homologs_file.
	"""
	lines_to_skip = 1
	refbase_list = ['A', 'C', 'T', 'G']

	# The unpacking order below mirrors the return order of the initializer.
	refbase_to_total_number_of_HQ_base_reads, refbase_to_number_of_HQ_base_reads_for_newbase_dict, \
		refbase_to_total_number_of_HQ_and_LQ_base_reads, refbase_to_number_of_HQ_and_LQ_base_reads_for_newbase_dict \
		= initialize_base_change_frequency_statistics_dictionaries(refbase_list)

	gene_list = get_gene_list_from_gene_list_with_homologs_file(gene_list_with_homologs_filename)

	for gene in gene_list:
		line_dictionary_list_for_each_position = helper_functions.read_parsed_pileup_file(
			parsed_pileup_filename, lines_to_skip, gene, global_header_field_list)

		# Genes that are absent from the parsed pileup file contribute nothing.
		if not line_dictionary_list_for_each_position:
			continue

		update_base_change_frequency_statistics_by_information_for_gene(
			line_dictionary_list_for_each_position,
			refbase_to_total_number_of_HQ_base_reads,
			refbase_to_total_number_of_HQ_and_LQ_base_reads,
			refbase_to_number_of_HQ_base_reads_for_newbase_dict,
			refbase_to_number_of_HQ_and_LQ_base_reads_for_newbase_dict)

	# Single-argument print(...) produces the same output on Python 2 and 3.
	print(' === Total number of base reads for each refbase, before and after throwing out low quality base reads ===')
	for refbase in refbase_list:
		# NOTE(review): the column header is repeated before every refbase row.
		# Kept as-is so the script's output format does not change; confirm
		# whether it was meant to be printed only once.
		print('Refbase\tHQ & LQ\tHQ only')
		print('%s\t%d\t%d' % (refbase,
				      refbase_to_total_number_of_HQ_and_LQ_base_reads[refbase],
				      refbase_to_total_number_of_HQ_base_reads[refbase]))
	print('')
	print('  === Base change rates by refbase, before and after throwing out low quality base reads ===')
	for refbase in refbase_list:
		print('')
		newbase_list = [base for base in refbase_list if base != refbase]
		for row_index, newbase in enumerate(newbase_list):
			newbase_fraction_HQ = _fraction_or_nan(
				refbase_to_number_of_HQ_base_reads_for_newbase_dict[refbase][newbase],
				refbase_to_total_number_of_HQ_base_reads[refbase])
			newbase_fraction_HQ_and_LQ = _fraction_or_nan(
				refbase_to_number_of_HQ_and_LQ_base_reads_for_newbase_dict[refbase][newbase],
				refbase_to_total_number_of_HQ_and_LQ_base_reads[refbase])
			# Only the first row of a group shows the 'X > ' prefix; the
			# following rows are padded so the new bases line up under it.
			if row_index == 0:
				row_label = refbase + ' > ' + newbase
			else:
				row_label = '    ' + newbase
			print('%s\t%.5f\t%.5f' % (row_label, newbase_fraction_HQ_and_LQ, newbase_fraction_HQ))

def get_gene_list_from_gene_list_with_homologs_file(gene_list_with_homologs_filename):
	"""Return the flat list of gene names found in a gene-list-with-homologs file.

	Each line of the file is handled as follows:
	  * blank lines and lines starting with 'Organism:' are ignored;
	  * a line starting with '(' must end with ')'; the surrounding
	    parentheses are dropped and the rest is treated like any other line;
	  * the line is split on whitespace and every token is one gene name.

	Names are returned in the order they appear in the file.

	Args:
	    gene_list_with_homologs_filename: path of the file to read.

	Returns:
	    A list of gene name strings with no duplicates.

	Raises:
	    IOError: if the file cannot be opened, if a line opens with '(' but
	        does not close with ')', or if the same gene name appears more
	        than once in the file.
	"""
	gene_list = []
	# Set used only for O(1) duplicate detection; gene_list keeps file order.
	seen_genes = set()

	# 'with' closes the file even when one of the format errors below is raised.
	with open(gene_list_with_homologs_filename, 'r') as gene_list_with_homologs_file:
		for line in gene_list_with_homologs_file:
			line = line.strip()
			if not line or line.startswith('Organism:'):
				continue
			if line.startswith('('):
				if not line.endswith(')'):
					raise IOError('Expected (homolog1 homolog2 . . . homologN)')
				line = line[1:-1]
			for homolog in line.split():
				if homolog in seen_genes:
					raise IOError('Unexpectedly found same gene (%s) appear more than once in gene_list_with_homologs_file' % homolog)
				seen_genes.add(homolog)
				gene_list.append(homolog)
	return gene_list

# Start the analysis only when this file is executed as a script, so that
# importing the module does not kick off a run.
if __name__ == '__main__':
	main(sys.argv[1], sys.argv[2])
