#!/usr/bin/env python
# test_for_contamination.py
# Assumes that the mutation candidates file is tab-delimited and has no header line

import sys, math, stats
# Require exactly three command-line arguments (sys.argv[0] is the script name).
if len(sys.argv) != 4:
	# Call-style raise: the "raise Class, message" statement form only parses on Python 2.
	raise IOError('Usage: ./test_for_contamination.py mutation_candidates_file num_rows num_columns')

# Uncorrected p-value cutoff: test_for_contamination() prints a well only when
# its per-well p-value (output column three) is <= this threshold.
global_pvalue_threshold = 0.1

def test_for_contamination(mutation_candidates_filename, num_rows_string, num_columns_string):
	num_rows = int(num_rows_string)
	num_columns = int(num_columns_string)
	num_wells = num_rows * num_columns

	mutation_candidates_file = open(mutation_candidates_filename, 'r')
	well_count_dict = {}
	num_well_candidates = 0
	for line in mutation_candidates_file:
		line = line.strip()
		field_list = line.split('\t')
		row_library = field_list[5]
		column_library = field_list[6]

		if (row_library != 'NA') and (column_library != 'NA'):
			# well mutation
			well_string = '%-16s' % ('(' + row_library.strip() + ', ' + column_library.strip() + ')')
			if not (well_string in well_count_dict):
				well_count_dict[well_string] = 0
			well_count_dict[well_string] += 1
			num_well_candidates += 1
	well_count_tuple_list = []
	for well in well_count_dict:
		num_occurrences = well_count_dict[well]
		well_count_tuple_list.append((num_occurrences, well))
	well_count_tuple_list.sort()
	well_count_tuple_list.reverse()

	print
	print 'Warning: Pvalues in column three are not corrected for multiple testing'
	print
	print '\t\t\t\t\t\t\tMultiple Testing Pvalues'
	print 'Well (Row, Column)\tCount\tPvalue*\tFWER pvalue**\tUpper bound pvalue***'
	for well_count_tuple in well_count_tuple_list:
		(num_occurrences, well) = well_count_tuple
		well_probability = 1.0 / float(num_wells)
		# compute probability that you would have gotten this many or more well candidates in the current well
		pvalue = 1 - stats.pbinom(num_occurrences - 1, num_well_candidates, well_probability)
		mult_test_pvalue_assume_independence = 1 - math.pow(1.0 - pvalue, num_wells)
		mult_pvalue_upper_bound = pvalue * num_wells
		if pvalue <= global_pvalue_threshold:
			print '%s\t%d\t%.3e\t%.3e\t%.3e' % (well, num_occurrences, pvalue, mult_test_pvalue_assume_independence, mult_pvalue_upper_bound)
	print
	print '* Column three lists the probability, for each well, that the well'
	print 'would have greater than or equal to the number of observed mutation'
	print 'candidates.'
	print 'Due to the statistics of multiple testing, the probability that one'
	print 'of the many wells will appear significant increases with the number'
	print 'of wells, which provides the motivation for the next two columns,'
	print 'which give an estimate of the probability that at least one well'
	print 'would have greater then or equal to the number of observed mutation'
	print 'candidates for the current well.'
	print
	print '** Assumes independence of one well having a significant pvalue from'
	print 'another well having a significant pvalue.  This is a false assumption,'
	print 'so the pvalue may not be completely accurate.'
	print
	print '*** Upper bound on the multiple testing pvalue.  If this is < 0.01,'
	print 'then the well is definitely overrepresented.'

# Script entry point: pass the three command-line arguments (candidates file,
# number of rows, number of columns) straight to the analysis.
test_for_contamination(sys.argv[1], sys.argv[2], sys.argv[3])
