#!/usr/bin/env python
# create_scripts.py

import sys

# The script requires exactly two command-line arguments.  This check runs
# as soon as the file is executed, before any function is defined.
if len(sys.argv) != 3:
	# Call-style raise is valid in both Python 2 and Python 3; the exception
	# type and message are unchanged.
	raise IOError('Usage: ./create_scripts.py tilling_run_number num_pooling_dimensions_string')

# Mutation-detection methods for which the second run script emits detection
# code.  'bayesian' is expected to be present: the threshold sweep in the
# generated script is built around the bayesian results.
#global_list_of_methods_to_use = ['bayesian', 'alternate', 'z_scores']
global_list_of_methods_to_use = ['bayesian']

def create_scripts(tilling_run_number_string, num_pooling_dimensions_string):
	"""Generate the second run script, then print the usage instructions.

	Both arguments are strings (as received from the command line) and are
	converted with int(); a non-integer value raises ValueError.
	"""
	run_number = int(tilling_run_number_string)
	pooling_dimensions = int(num_pooling_dimensions_string)

	# Generation of the setup script and of the first run script is
	# currently switched off:
	#library_to_URL_dict = get_library_to_URL_dict()
	#write_setup_script(library_to_URL_dict)
	#write_run_script(library_to_URL_dict)

	write_second_run_script(run_number, pooling_dimensions)
	print_instructions_to_user(run_number)

def print_instructions_to_user(tilling_run_number):
	"""Print the step-by-step pipeline instructions to standard output.

	tilling_run_number is currently unused; it is only needed by the
	contamination-test instructions that are commented out at the end of
	this function.

	Every message is passed to print() as one parenthesized string, which
	yields identical output under the Python 2 print statement and the
	Python 3 print function.  Each entry below is printed followed by a
	newline, so entries that themselves end in a newline produce blank
	lines in the output.
	"""
	instruction_lines = (
		'\n-------------------------------------------------------------',
		'Instructions to Run the Tilling Mutation Analysis Pipeline:',
		'-------------------------------------------------------------\n',
		'Run the following commands to setup the directory structure, download the sequence files, and align the reads using MAQ:',
		'> chmod 744 generated_setup_script.sh',
		'> ./generated_setup_script.sh',
		'> chmod 744 generated_run_script.sh',
		'> ./generated_run_script.sh',
		'\n',
		'After a parsed_pileup.txt file has been generated for each library (check for submitted jobs that are still running using qstat),',
		'run the following command to concatenate them all into one file:\n',
		'> ./concatenate_all_parsed_pileups.sh <tilling_run_number>\n',
		'The above command depends upon the assumption that all libraries have names of the form T<number>[CR]<number>.\n\n',
		'Before running the second generated run script, please make sure that in the \'find_mutations.py\' script, the two options',
		'\'global_FrHQ_threshold_standard_EMS_mutation_candidate\' and \'global_FrHQ_threshold_not_standard_EMS_mutation_candidate\'',
		'are both set to 0.',
		'The option \'global_well_only\' should be set to True when running BA-THRESHER (method=\'bayesian\'),',
		'since this method is currently only set up to return \'well candidates.\'',
		'\nImportant: there are also a set of options in \'find_mutations.py\' that relate mainly to the pooling structure of the particular ',
		'experiment in question: \'global_use_3D_pooling\', \'global_num_pools_per_row_library\', \'global_num_pools_per_column_library\', ',
		'\'global_num_pools_per_d_library\', \'global_num_individuals_per_pool\', \'global_number_of_libraries\', \'global_tilling_run_prefix\', ',
		'\'global_row_library_list\', \'global_column_library_list\', \'global_d_library_list\', \'global_extra_library_name_modifiers_list\', ',
		'and \'global_read_length\'.  The default parameters have been set for the wheat experiment (Tilling5) described in the TILLING paper.',
		'Note that the 3D pooling option is still in the experimental stage.',
		'\n\nIf these options are set as follows, then run the following commands to finish the analysis:',
		'> chmod 744 generated_run_script_2.sh',
		'> ./generated_run_script_2.sh\n\n',
		'All the results files should be created in the \'TillingN\' subdirectory where this script is located.\n',
	)
	for instruction_line in instruction_lines:
		print(instruction_line)

	## Contamination test instructions - must eventually uncomment these:

	#print('To test if one of the wells might contain a contaminant, run the following command:')
	#print('>./test_for_contamination.py T%d_wellonly_mutation_candidates_HQ_thresh_0_std_and_nonstd_contamination_test.txt num_rows num_columns' % tilling_run_number)
	#print('after substituting the number of rows and columns in the TILLING run for \'num_rows\' and \'num_columns\' respectively.\n')
	#print('It also might be worth running the above command using \'T%d_wellonly_mutation_candidates_HQ_thresh_0.5_std_and_nonstd_contamination_test.txt\'' % tilling_run_number)
	#print('as the mutation candidates file.\n')
	return

def get_library_to_URL_dict():
	"""Parse the file "sequence_URL_file" into a {library name: URL} dict.

	The file is opened relative to the current working directory.  Blank
	lines are skipped; the remaining lines must alternate between a library
	name and a URL beginning with "http://".  If a library name occurs more
	than once, the last URL given for it wins.

	Raises IOError if the file cannot be opened, if a URL appears where a
	library name is expected, if the file ends after a library name, or if
	the line following a library name does not begin with "http://".
	"""
	sequence_URL_file = open("sequence_URL_file", 'r')
	library_to_URL_dict = {}
	# try/finally guarantees the handle is released even when one of the
	# format errors below is raised part-way through the file.
	try:
		while True:
			## look for library name
			library, bool_eof = get_next_nonempty_line(sequence_URL_file)
			if bool_eof:
				# reached end of file
				break

			# check for URL being placed before library name
			if library.startswith("http://"):
				raise IOError('Error: found URL where library name was expected')

			## look for URL
			URL, bool_eof = get_next_nonempty_line(sequence_URL_file)
			if bool_eof:
				raise IOError('Error: did not find URL corresponding to library: %s' % library)

			if not URL.startswith("http://"):
				raise IOError('Error: URL expected, but found line that does not begin with \'http://\': "%s"' % URL)

			library_to_URL_dict[library] = URL
	finally:
		sequence_URL_file.close()
	return library_to_URL_dict

def write_setup_script(library_to_URL_dict):
	"""Write "generated_setup_script.sh" into the current directory.

	The generated script copies helper scripts from ../scripts, converts
	the tilling sequences to .bfa format, and then, for every library in
	library_to_URL_dict (library name -> URL), creates a directory named
	after the library and downloads the URL into it with wget.

	The generated script uses pushd/popd, which are bash builtins, so it is
	given a bash shebang; with /bin/sh it fails on systems where sh is not
	bash.
	"""
	# generate the setup script
	print('\nGenerating setup script . . .')
	outfile = open("generated_setup_script.sh", 'w')
	# try/finally so the script file is closed even if a write fails
	try:
		outfile.write('#!/bin/bash\n\n')

		outfile.write('\n## copy required scripts from \'scripts\' folder\n\n')
		outfile.write('cp -f ../scripts/maq_setup_seqs.sh .\n')
		outfile.write('cp -f ../scripts/parse_pileup_revised_VVM2.m .\n')
		outfile.write('cp -f ../scripts/concatenate_all_parsed_pileups.sh .\n')
		outfile.write('cp -f ../scripts/base_change_frequency.py .\n')
		#outfile.write('cp -f ../scripts/get_average_frequency_of_each_base_change_from_parsed_pileup.py .\n')
		#outfile.write('cp -f ../scripts/get_average_coverage_by_gene_and_library_from_parsed_pileup.py .\n')
		#outfile.write('cp -f ../scripts/get_number_of_reads_for_each_library.sh .\n')
		outfile.write('cp -f ../scripts/stats.py .\n')
		outfile.write('cp -f ../scripts/test_for_contamination.py .\n\n')

		outfile.write('\n## convert tilling sequences to .bfa format\n\n')
		outfile.write('./maq_setup_seqs.sh Sequence_Files/tilling_seqs.txt\n')

		outfile.write('\n\n## create a directory for each library and download the sequences for each library\n\n')
		for library in library_to_URL_dict:
			URL = library_to_URL_dict[library]

			outfile.write('mkdir %s\n' % library)
			outfile.write('pushd %s\n' % library)
			outfile.write('wget -d %s\n' % URL)
			outfile.write('popd\n\n')
	finally:
		outfile.close()
	return

def write_run_script(library_to_URL_dict):
	"""Write "generated_run_script.sh" into the current directory.

	Only the keys (library names) of library_to_URL_dict are used.  The
	generated script enters each library directory in turn, copies the
	reference and the MAQ/octave helper scripts into it, and runs
	run_all.sh there.

	The generated script stores the library names in a shell array, which
	plain POSIX sh does not support, so it is given a bash shebang.
	"""
	print('Generating the first run script . . .')
	# iterating a dict yields its keys, i.e. the library names
	library_string = ' '.join(library_to_URL_dict)

	outfile = open("generated_run_script.sh", 'w')
	# try/finally so the script file is closed even if a write fails
	try:
		outfile.write("#!/bin/bash\n\n")
		outfile.write("lane_list=( %s )\n\n" % library_string)
		outfile.write("for lane in ${lane_list[*]}; do\n")
		outfile.write("\tcd $lane\n")
		outfile.write("\techo $lane\n")
		outfile.write("\tcp -f ../ref.bfa .\n")
		outfile.write("\tcp -f ../../scripts/maq_prep.sh .\n")
		outfile.write("\tcp -f ../../scripts/maq_run.sh .\n")
		outfile.write("\tcp -f ../../scripts/octave_run.sh .\n")
		outfile.write("\tcp -f ../../scripts/run_all.sh .\n")
		outfile.write("\tchmod 744 *sh\n")
		outfile.write("\t./run_all.sh\n")
		outfile.write("\tcd ..\n")
		outfile.write("done\n")
	finally:
		outfile.close()
	return

def write_second_run_script(tilling_run_number, num_pooling_dimensions):
	"""Write "generated_run_script_2.sh" into the current directory.

	The generated bash script (with "#$" scheduler directive lines) rotates
	its log file, records num_pooling_dimensions as a shell variable, and
	then contains, in order:
	  1. mutation-candidate detection code for every method in
	     global_list_of_methods_to_use, at threshold 'NA';
	  2. the sweep over the bayesian thresholds;
	  3. the base-change frequency / coverage statistics commands.

	tilling_run_number and num_pooling_dimensions must be integers (they
	are formatted with %d).
	"""
	print('Generating the second run script . . .')

	outfile = open("generated_run_script_2.sh", 'w')
	# The last helper called below closes outfile itself; the finally clause
	# additionally guarantees the file is closed when any step raises before
	# that point.  Closing an already-closed file is a no-op.
	try:
		outfile.write("#!/bin/bash\n\n")
		outfile.write("#$ -cwd\n")
		outfile.write("#$ -S /bin/bash\n\n")
		#outfile.write("## get name of current directory\n")
		#outfile.write("current_path=`pwd`\n")
		#outfile.write("tilling_directory=${current_path##*/}\n")

		## Save old log file, and create a new log file

		outfile.write("if [ -e  LOGFILE-generated_run_script_2 ]; then\n")
		outfile.write("\tcp -p LOGFILE-generated_run_script_2 LOGFILE-generated_run_script_2-from_previous_run\n")
		outfile.write("fi\n")
		outfile.write("rm -f LOGFILE-generated_run_script_2\n")
		outfile.write("touch LOGFILE-generated_run_script_2\n")

		outfile.write("\n")
		outfile.write("num_pooling_dimensions=%d\n" % num_pooling_dimensions)

		## Run mutation detection tests

		threshold_list_for_bayesian_method = ['-10', '-5', '0', '5', '10', '15', '20']

		for method_to_use in global_list_of_methods_to_use:
			is_contamination_test = False
			threshold_for_method_string = 'NA'
			print_code_to_detect_mutation_candidates(outfile, tilling_run_number, is_contamination_test,
								method_to_use,
								threshold_for_method_string,
								num_pooling_dimensions)

		get_results_of_each_method_at_different_thresholds(outfile, tilling_run_number, threshold_list_for_bayesian_method,
								   num_pooling_dimensions)

		## Get overall statistics

		compute_base_change_frequency_and_coverage_statistics(outfile, tilling_run_number)

		#is_contamination_test = True
		#method_to_use = 'bayesian'
		#threshold_for_method_string = '${bayesian_method_thresh_with_most_predictions}'
		#print_code_to_detect_mutation_candidates(outfile, tilling_run_number, is_contamination_test,
		#					method_to_use,
		#					threshold_for_method_string,
		#					num_pooling_dimensions)

		## Compute orphan statistics from results of Bayesian method
		#method_to_use = 'bayesian'
		#threshold_for_method_string = '${bayesian_method_thresh_with_most_predictions}'
		#print_code_to_compute_orphan_statistics(outfile, tilling_run_number, method_to_use, threshold_for_method_string)
	finally:
		outfile.close()

	return

def print_code_to_compute_orphan_statistics(outfile, tilling_run_number, method_to_use, threshold_for_method_string):
	"""Write the orphan-statistics shell commands to outfile.

	Emits one shell comment line followed by two invocations of
	get_orphan_statistics_from_mutation_candidates_file.py: one for the
	HQ_thresh_0 candidates file and one for the HQ_thresh_0.5 candidates
	file.  Both files are looked up, and both result files are written, in
	the directory '<method_to_use>_method_threshold_<threshold_for_method_string>'.
	"""
	candidates_dir = '%s_method_threshold_%s' % (method_to_use, threshold_for_method_string)

	# Each command names an input and an output file, and both are built
	# from the same (directory, run number) pair.
	path_args = (candidates_dir, tilling_run_number) * 2

	command_templates = (
		"../find_mutations/get_orphan_statistics_from_mutation_candidates_file.py %s/T%d_mutation_candidates_HQ_thresh_0_std_and_nonstd.txt > %s/T%d_orphan_statistics_HQ_thresh_0_std_and_nonstd\n",
		"../find_mutations/get_orphan_statistics_from_mutation_candidates_file.py %s/T%d_mutation_candidates_HQ_thresh_0.5_std_and_nonstd.txt > %s/T%d_orphan_statistics_HQ_thresh_0.5_std_and_nonstd\n\n",
	)

	outfile.write("## Get statistics on orphan vs well candidate mutations at different cutoff thresholds for fraction of base change calls that are high quality\n")
	for template in command_templates:
		outfile.write(template % path_args)
	return

def compute_base_change_frequency_and_coverage_statistics(outfile, tilling_run_number):
	"""Write the statistics commands to outfile, then close outfile.

	The emitted shell commands extract the reference sequence names from
	Parsed_Pileup/T<run>_parsed_pileup.txt, compute the average frequency
	of each base change and the average coverage by gene and library, and
	record the number of reads per library.

	Note that this function closes outfile before returning, so it must be
	the last writer of the script being generated.
	"""
	# (template, number of times the run number is substituted into it)
	script_lines = (
		("## Compute base change frequency and coverage statistics for parsed pileup file\n", 0),
		("cut -f 1 Parsed_Pileup/T%d_parsed_pileup.txt > Parsed_Pileup/T%d_parsed_pileup-column_1.txt\n", 2),
		("../scripts/get_unique_lines.py Parsed_Pileup/T%d_parsed_pileup-column_1.txt > Parsed_Pileup/T%d_parsed_pileup-reference_sequence_names.txt\n", 2),
		("../scripts/get_average_frequency_of_each_base_change_from_parsed_pileup.py Parsed_Pileup/T%d_parsed_pileup.txt Parsed_Pileup/T%d_parsed_pileup-reference_sequence_names.txt > T%d_average_frequency_of_each_base_change\n", 3),
		("../scripts/get_average_coverage_by_gene_and_library_from_parsed_pileup.py Parsed_Pileup/T%d_parsed_pileup.txt Parsed_Pileup/T%d_parsed_pileup-reference_sequence_names.txt > T%d_average_coverage_by_gene_and_library\n\n", 3),
		("## Record number of reads for each library\n", 0),
		("../scripts/get_number_of_reads_for_each_library.sh > T%d_number_of_reads_for_each_library\n", 1),
	)
	for template, substitution_count in script_lines:
		if substitution_count:
			outfile.write(template % ((tilling_run_number,) * substitution_count))
		else:
			# plain comment line, nothing to substitute
			outfile.write(template)
	outfile.close()
	return


def print_code_to_detect_mutation_candidates(outfile, tilling_run_number, is_contamination_test, \
						method_to_use, \
						threshold_for_method_string, \
						num_pooling_dimensions):

	if is_contamination_test:
		is_contamination_test_string = 'true'
		mutation_candidates_file_postfix_string = '_contamination_test'
	else:
		is_contamination_test_string = 'false'
		mutation_candidates_file_postfix_string = ''
	
	mutation_candidates_directory_string='%s_method_threshold_%s' % (method_to_use, threshold_for_method_string)
	outfile.write("\nmkdir %s\n\n" % mutation_candidates_directory_string)

	outfile.write("current_tilling_directory=`pwd`\n")
	outfile.write("../find_mutations/generate_find_mutations_file.py List_of_genes_and_homologs_by_organism %d ${current_tilling_directory} %s %s %s > ../find_mutations/find_Tilling%d_mutations_generated_%s_method_threshold_%s%s.sh\n" \
		      % (tilling_run_number, is_contamination_test_string, method_to_use, threshold_for_method_string,
			 tilling_run_number, method_to_use, threshold_for_method_string, mutation_candidates_file_postfix_string))
	outfile.write("chmod 744 ../find_mutations/find_Tilling%d_mutations_generated_%s_method_threshold_%s%s.sh\n\n" \
		      % (tilling_run_number, method_to_use, threshold_for_method_string, mutation_candidates_file_postfix_string))
	outfile.write("## The following commands assume that the options \'global_FrHQ_threshold_standard_EMS_mutation_candidate\' and \n")
	outfile.write("## \'global_FrHQ_threshold_not_standard_EMS_mutation_candidate\', in the script \'TILLING/find_mutations/find_mutations.py\'\n")
	outfile.write("## are both set to 0, and that the option \'global_well_only\' is set to False:\n")
	outfile.write("pushd ../find_mutations\n")
	outfile.write("./find_Tilling%d_mutations_generated_%s_method_threshold_%s%s.sh > ${current_tilling_directory}/%s/T%d_mutation_candidates_HQ_thresh_0_std_and_nonstd%s.txt\n" \
		      % (tilling_run_number, method_to_use, threshold_for_method_string, mutation_candidates_file_postfix_string, \
			 mutation_candidates_directory_string, tilling_run_number, mutation_candidates_file_postfix_string))
	outfile.write("popd\n")
	outfile.write("../find_mutations/select_well_only_mutations_from_mutation_candidates_file.py %s/T%d_mutation_candidates_HQ_thresh_0_std_and_nonstd%s.txt $num_pooling_dimensions > %s/T%d_wellonly_mutation_candidates_HQ_thresh_0_std_and_nonstd%s.txt\n\n" \
		      % (mutation_candidates_directory_string, tilling_run_number, mutation_candidates_file_postfix_string, \
			 mutation_candidates_directory_string, tilling_run_number, mutation_candidates_file_postfix_string))
	#outfile.write("../find_mutations/select_well_and_orphan_mutation_candidates_by_FrHQ_for_Nn_threshold.py %s/T%d_mutation_candidates_HQ_thresh_0_std_and_nonstd%s.txt %s/T%d_mutation_candidates_HQ_thresh_0.5_std_and_nonstd%s.txt 0.5\n" \
	#	      % (mutation_candidates_directory_string, tilling_run_number, mutation_candidates_file_postfix_string, \
	#		 mutation_candidates_directory_string, tilling_run_number, mutation_candidates_file_postfix_string))
	#outfile.write("../find_mutations/select_well_and_orphan_mutation_candidates_by_FrHQ_for_Nn_threshold.py %s/T%d_wellonly_mutation_candidates_HQ_thresh_0_std_and_nonstd%s.txt %s/T%d_wellonly_mutation_candidates_HQ_thresh_0.5_std_and_nonstd%s.txt 0.5\n\n" \
	#	      % (mutation_candidates_directory_string, tilling_run_number, mutation_candidates_file_postfix_string, \
	#		 mutation_candidates_directory_string, tilling_run_number, mutation_candidates_file_postfix_string))
	return

def get_results_of_each_method_at_different_thresholds(outfile, tilling_run_number, threshold_list_for_bayesian_method,
							num_pooling_dimensions):
	"""Write the threshold-sweep section of the second run script to outfile.

	The emitted bash code loops over threshold_list_for_bayesian_method (a
	list of strings).  For each threshold it selects the bayesian well
	candidates passing that threshold, copies them into
	well_predictions_at_FrHq_thresh_0_std_and_nonstd/, and then asks every
	other method in global_list_of_methods_to_use for the same number of
	predictions, logging to LOGFILE-generated_run_script_2.

	tilling_run_number must be an integer (formatted with %d).
	num_pooling_dimensions is not read here: the emitted code refers to the
	shell variable $num_pooling_dimensions instead.
	"""
	mutation_candidates_file_postfix_string = ''
	# (run number, postfix) identifies one candidates file name; templates
	# that name two files take this pair twice.
	file_id = (tilling_run_number, mutation_candidates_file_postfix_string)

	outfile.write("bayesian_thresh_list=( %s )\n" % ' '.join(threshold_list_for_bayesian_method))
	outfile.write("\n")
	outfile.write("mkdir well_predictions_at_FrHq_thresh_0_std_and_nonstd\n")
	outfile.write("\n")
	outfile.write("for thresh in ${bayesian_thresh_list[*]}; do\n")
	# The guard must test the VALUE of the shell variable; comparing the
	# bare word "thresh" against NA could never be true.
	outfile.write("\tif [ \"$thresh\" == NA ]; then\n")
	outfile.write("\t\techo 'Error: should not use NA as the thresh for the bayesian method at this part of the shell script'\n")
	outfile.write("\t\texit 1\n")
	outfile.write("\tfi\n")

	outfile.write("\t\n")
	outfile.write("\tmkdir bayesian_method_threshold_${thresh}\n")
	#outfile.write("\tbest_thresh_is_high=false\n")
	outfile.write("\tbest_thresh_is_high=true\n")
	outfile.write("\t../scripts/transform_scores_for_bayesian_method_candidates_file.py bayesian_method_threshold_NA/T%d_wellonly_mutation_candidates_HQ_thresh_0_std_and_nonstd%s.txt $num_pooling_dimensions > bayesian_method_threshold_NA/T%d_wellonly_mutation_candidates_HQ_thresh_0_std_and_nonstd%s-transformed_scores.txt\n"
		      % (file_id + file_id))
	outfile.write("\tmutation_calling_option=t\n")
	outfile.write("\toutput_option=q\n")
	outfile.write("\t../scripts/select_top_mutations_from_mutation_candidates_file.py bayesian_method_threshold_NA/T%d_wellonly_mutation_candidates_HQ_thresh_0_std_and_nonstd%s-transformed_scores.txt threshold ${thresh} $best_thresh_is_high $mutation_calling_option $output_option $num_pooling_dimensions bayesian_method_threshold_${thresh}/T%d_wellonly_mutation_candidates_HQ_thresh_0_std_and_nonstd%s-transformed_scores.txt\n"
		      % (file_id + file_id))
	outfile.write("\t\n")

	## Use FrHq_for_Nn_threshold = 0
	outfile.write("\twc_list=( `wc -l bayesian_method_threshold_${thresh}/T%d_wellonly_mutation_candidates_HQ_thresh_0_std_and_nonstd%s-transformed_scores.txt` )\n"
		      % file_id)
	outfile.write("\tnum_predictions_for_thresh=${wc_list[0]}\n")
	outfile.write("\t\n")
	outfile.write("\tcp -p bayesian_method_threshold_${thresh}/T%d_wellonly_mutation_candidates_HQ_thresh_0_std_and_nonstd%s-transformed_scores.txt well_predictions_at_FrHq_thresh_0_std_and_nonstd/bayesian_${thresh}\n"
		      % file_id)
	outfile.write("\t\n")
	outfile.write("\t## Get the results for the remaining methods when making the same number of predictions as the Bayesian method\n")
	# Filtering (rather than list.remove) also copes with a method list that
	# does not contain 'bayesian', and never mutates the module-level list.
	global_list_of_methods_to_use_besides_bayesian = [method for method in global_list_of_methods_to_use
							  if method != 'bayesian']
	# With no other methods the emitted loop has an empty word list and its
	# body is simply never executed.
	outfile.write("\tfor method in %s; do\n" % ' '.join(global_list_of_methods_to_use_besides_bayesian))
	outfile.write("\t\tfrom_directory=${method}_method_threshold_NA\n")
	outfile.write("\t\ttemp_directory=${method}_method_temp\n")
	outfile.write("\t\tmutation_calling_option=t\n")
	outfile.write("\t\toutput_option=q\n")
	outfile.write("\t\tmkdir $temp_directory\n")
	outfile.write("\t\tbest_thresh_is_high=true\n")
	outfile.write(("\t\tresult_list=( `../scripts/select_top_mutations_from_mutation_candidates_file.py ${from_directory}/T%d_wellonly_mutation_candidates_HQ_thresh_0_std_and_nonstd%s.txt"
		       + " number ${num_predictions_for_thresh} $best_thresh_is_high $mutation_calling_option $output_option"
		       + " $num_pooling_dimensions ${temp_directory}/T%d_wellonly_mutation_candidates_HQ_thresh_0_std_and_nonstd%s.txt` )\n")
		      % (file_id + file_id))
	outfile.write("\t\tthreshold_that_method_stopped_at=${result_list[0]}\n")
	outfile.write("\t\t\n")
	outfile.write("\t\tto_directory=${method}_method_threshold_${threshold_that_method_stopped_at}\n")
	outfile.write("\t\tif [ -e $to_directory ]; then\n")
	outfile.write("\t\t\trm -r $to_directory\n")
	outfile.write("\t\tfi\n")
	outfile.write("\t\t\n")
	outfile.write("\t\tmv $temp_directory $to_directory\n")
	outfile.write("\t\t\n")
	outfile.write("\t\t# print any error message\n")
	outfile.write("\t\tif [ ${#result_list[*]} == 2 ]; then\n")
	outfile.write("\t\t\techo ${result_list[1]} >> LOGFILE-generated_run_script_2\n")
	outfile.write("\t\tfi\n")
	outfile.write("\t\t\n")
	outfile.write("\t\twc_list=( `wc -l ${to_directory}/T%d_wellonly_mutation_candidates_HQ_thresh_0_std_and_nonstd%s.txt` )\n"
		      % file_id)
	outfile.write("\t\tnum_predictions_returned_by_method=${wc_list[0]}\n")
	outfile.write("\t\t\n")
	outfile.write("\t\tcp -p ${to_directory}/T%d_wellonly_mutation_candidates_HQ_thresh_0_std_and_nonstd%s.txt well_predictions_at_FrHq_thresh_0_std_and_nonstd/${method}_${num_predictions_returned_by_method}\n"
		      % file_id)
	outfile.write("\t\techo ${method} method returns ${num_predictions_returned_by_method} predictions by breaking at threshold ${threshold_that_method_stopped_at} >> LOGFILE-generated_run_script_2\n")
	outfile.write("\tdone\n")
	outfile.write("done\n\n")
	return

def get_next_nonempty_line(file):
	"""Return (line, bool_eof) for the next non-blank line read from file.

	Lines are read with file.readline() and stripped of surrounding
	whitespace; lines that are empty after stripping are skipped.  When the
	end of the file is reached first, the result is ("", True); otherwise
	it is (stripped_line, False).
	"""
	# readline() returns "" only at end of file, so "" is used as the
	# sentinel that terminates the iteration.
	for raw_line in iter(file.readline, ""):
		stripped_line = raw_line.strip()
		if stripped_line:
			return stripped_line, False

	# end of file
	return "", True

# Script entry point: the two command-line arguments (their count was checked
# at the top of the file) are passed through as strings and converted to
# integers inside create_scripts().
create_scripts(sys.argv[1], sys.argv[2])
