#!/usr/bin/env python
# generate_find_mutations_file.py

import sys

# Value written as "output_option" in the generated script, which passes it
# on to every find_mutations.sh call. The commented-out line is the
# alternative value, kept for switching by hand.
global_output_option="standard_output"
#global_output_option="parsed_pileup_subset"

# Exactly six arguments are required (sys.argv[0] is the script name itself).
# The call form of raise is used because it behaves the same on Python 2 and
# is also valid syntax on Python 3.
if len(sys.argv) != 7:
	raise IOError('Usage: ./generate_find_mutations_file.py list_of_genes_and_homologs_by_organism_file tilling_run_number path_to_current_tilling_directory is_contamination_test method_to_use threshold_for_method > generated_file')

def generate_file(infilename, tilling_run_number_string, path_to_current_tilling_directory, is_contamination_test_string, method_to_use, threshold_for_method_string):
	"""Write the whole generated shell script to standard output.

	Emits, in order, the script header, the shell variable definitions and
	the per-gene find_mutations.sh calls read from the input file.

	Parameters:
		infilename -- path of the genes/homologs-by-organism list to read
		tilling_run_number_string -- run number as text; must parse as an int
		path_to_current_tilling_directory -- directory used to build the
			paths written into the generated script
		is_contamination_test_string, method_to_use,
		threshold_for_method_string -- copied verbatim into shell variables

	Raises:
		IOError -- if the input file cannot be opened or is malformed
		ValueError -- if tilling_run_number_string is not an integer
	"""
	# "with" guarantees the input file is closed even when the run number is
	# invalid or one of the printing steps raises part-way through.
	with open(infilename, 'r') as infile:
		tilling_run_number = int(tilling_run_number_string)
		print_header(path_to_current_tilling_directory)
		print_constants(path_to_current_tilling_directory, is_contamination_test_string, method_to_use, threshold_for_method_string, global_output_option)
		print_calls_to_find_mutations_sh_script(infile, tilling_run_number, path_to_current_tilling_directory)
	return

def print_header(path_to_current_tilling_directory):
	"""Print the shebang line and a title comment for the generated script.

	The title embeds the last component of path_to_current_tilling_directory;
	trailing slashes are ignored when picking that component. The title line
	is followed by one blank line.
	"""
	# Single-argument print(...) produces the same output on Python 2 and 3.
	print("#!/bin/sh")
	current_tilling_directory_name = path_to_current_tilling_directory.rstrip('/').split('/')[-1]
	print("# find_%s_mutations_generated.sh (generated automatically)\n" % current_tilling_directory_name)
	return

def print_constants(path_to_current_tilling_directory, is_contamination_test_string, method_to_use, threshold_for_method_string, output_option):
	"""Print the shell variable assignments used by the generated script.

	Writes tilling_seq_file (located under
	<path_to_current_tilling_directory>/Sequence_Files), followed by a blank
	line, then is_contamination_test, method_to_use, threshold_for_method and
	output_option, followed by a blank line. All values are inserted
	verbatim, without quoting.
	"""
	# Single-argument print(...) produces the same output on Python 2 and 3.
	print("tilling_seq_file=%s/Sequence_Files/tilling_seqs.txt\n" % path_to_current_tilling_directory)
	print("is_contamination_test=%s" % is_contamination_test_string)
	print("method_to_use=%s" % method_to_use)
	print("threshold_for_method=%s" % threshold_for_method_string)
	print("output_option=%s\n" % output_option)
	return

def print_calls_to_find_mutations_sh_script(infile, tilling_run_number, path_to_current_tilling_directory):
	"""Print the per-gene job commands, the wait loop and the result collection.

	The input is a sequence of sections separated by blank lines. Each
	section starts with a header line "Organism: <organism>" (the keyword is
	case-insensitive and the organism must be a single word), followed by
	one line per gene or per parenthesised homolog list. Every gene line is
	handed to process_line, which prints the job commands and records the
	(gene, organism) pair.

	After all sections, prints a shell loop that sleeps while any semaphore
	file of a recorded job still exists, then a "cat" and an "rm" command
	for each job's candidates file.

	Parameters:
		infile -- open file-like object providing readline()
		tilling_run_number -- integer run number, forwarded to process_line
		path_to_current_tilling_directory -- forwarded to process_line

	Raises:
		IOError -- if a section does not start with a valid organism header
	"""
	processed_gene_organism_tuple_list = []
	while True:
		# Process genes for organism
		line = infile.readline()
		if line == "":
			# reached end of file
			break
		line = line.strip()
		if line == "":
			# skip blank lines between sections
			continue
		field_list = line.split()
		if (len(field_list) != 2) or (field_list[0].lower() != "organism:"):
			raise IOError('Format error: expected "Organism: <organism>"')
		organism = field_list[1]

		print('')
		print('#Organism: %s' % organism)
		while True:
			# process each individual gene or list of homologs; an empty
			# string here means either end of file or the blank line that
			# ends this organism's section
			line = infile.readline().strip()
			if line == "":
				break
			process_line(line, tilling_run_number, path_to_current_tilling_directory, organism, processed_gene_organism_tuple_list)

	## Wait until all jobs are finished running
	# With no jobs the loop condition would be empty ("while ; do"), which is
	# a shell syntax error, so the wait section is only emitted when needed.
	if processed_gene_organism_tuple_list:
		semaphore_tests = ['[ -e semaphore/gene_%s_organism_%s ]' % gene_organism
				for gene_organism in processed_gene_organism_tuple_list]
		print('while ' + ' || '.join(semaphore_tests) + '; do')
		print('\t# don\'t overload the head node')
		print('\tsleep 30m')
		print('done')
		print('sleep 30s')
	print('')
	print('# print candidates file for each gene')
	for (gene, organism) in processed_gene_organism_tuple_list:
		print('cat intermediate_files/_candidates_for_gene_%s_from_organism_%s' % (gene, organism))
	for (gene, organism) in processed_gene_organism_tuple_list:
		print('rm intermediate_files/_candidates_for_gene_%s_from_organism_%s' % (gene, organism))
	return

def process_line(line, tilling_run_number, path_to_current_tilling_directory, organism, processed_gene_organism_tuple_list):
	"""Print the job commands for one gene line of the input file.

	A line is either a single gene name, or a parenthesised,
	whitespace-separated list of at least two homologs such as "(g1 g2 g3)".
	For a homolog list one job is printed per gene, with the remaining
	homologs appended to that gene's find_mutations.sh command. A blank line
	is printed after the job(s) of the line.

	Parameters:
		line -- stripped input line
		tilling_run_number -- integer used in the parsed pileup file name
		path_to_current_tilling_directory -- directory containing Parsed_Pileup
		organism -- organism the gene(s) belong to
		processed_gene_organism_tuple_list -- list extended in place with one
			(gene, organism) tuple per printed job

	Raises:
		IOError -- if the line is neither a single gene nor a well-formed
			homolog list
	"""
	# startswith/endswith also cope with an empty string, which then falls
	# through to the single-gene format error instead of an IndexError.
	if line.startswith('('):
		# list of homologs
		if not line.endswith(')'):
			raise IOError('Homolog list should begin with "(" and end with ")"')
		homolog_text = line[1:-1]
		homolog_list = homolog_text.split()
		if len(homolog_list) < 2:
			raise IOError('Homolog list (%s) must contain at least two homologs' % homolog_text)
		for gene in homolog_list:
			# every homolog except (the first occurrence of) the gene itself
			other_homologs_list = list(homolog_list)
			other_homologs_list.remove(gene)
			_print_find_mutations_job(gene, organism, tilling_run_number, path_to_current_tilling_directory, processed_gene_organism_tuple_list, other_homologs_list)
		print('')
	else:
		# single gene on line
		field_list = line.split()
		if len(field_list) != 1:
			raise IOError('Line did not look like list of homologs, so there must be exactly one gene on the line: (%s)' % line)
		_print_find_mutations_job(field_list[0], organism, tilling_run_number, path_to_current_tilling_directory, processed_gene_organism_tuple_list, [])
		print('')
	return

def _print_find_mutations_job(gene, organism, tilling_run_number, path_to_current_tilling_directory, processed_gene_organism_tuple_list, other_homologs_list):
	"""Print the semaphore, cleanup and find_mutations.sh lines for one gene and record it."""
	print('touch semaphore/gene_%s_organism_%s' % (gene, organism))
	processed_gene_organism_tuple_list.append((gene, organism))
	print('rm -f intermediate_files/_candidates_for_gene_%s_from_organism_%s' % (gene, organism))
	# Fields are separated by exactly one space for both single genes and
	# homolog lists; the shell splits words the same way regardless.
	outline = ("./find_mutations.sh %s/Parsed_Pileup/T%d_parsed_pileup.txt %s ${output_option} $tilling_seq_file" +
			" $is_contamination_test $method_to_use $threshold_for_method %s" +
			" > intermediate_files/_candidates_for_gene_%s_from_organism_%s") % \
			(path_to_current_tilling_directory, tilling_run_number, organism, gene, gene, organism)
	# the other homologs follow the redirection as further arguments
	for homolog in other_homologs_list:
		outline += " %s" % homolog
	print(outline)
	return

# Script entry: hand the six command-line arguments (validated near the top
# of the file) to generate_file in the order they were given.
generate_file(*sys.argv[1:7])
