#!/usr/bin/env bash
# Requires bash: this script uses arrays, `source` and `==` inside `[ ]`,
# none of which are guaranteed by a POSIX /bin/sh (e.g. dash).


#Inputs:

#1. BGI 93-11 cDNA reference sequences
#2. MSU Nipponbare genomic reference sequence and gene model annotations

#Plan:

#1. For each 93-11 cDNA sequence (I may use a subset of sequences, if the mapping takes a long time):
#a. randomly select 100 reads
#b. map them to the Nipponbare reference
#c. tabulate read counts for each locus (including multiple alignments)
#
#2. For each of a series of values for the required number of read alignments to call a homology between a 93-11 cDNA and a Nipponbare locus, report:
#a. the number of 93-11 cDNAs that map to a single Nipponbare locus
#b. the number of 93-11 cDNAs that do not map to any Nipponbare locus
#c. the number of Nipponbare loci that are not mapped to by any 93-11 cDNAs


# Validate the argument count and bind the 13 positional parameters to names.
# Several parameters accept the literal sentinel NA (checked further down):
#   - exactly one source of sequence names is used: a random selection of
#     num_query_sequences_to_select names, or query_sequences_to_select_file
#   - exactly one of num_reads_to_select_per_query_sequence and
#     coverage_per_query_sequence must be NA
# method must be one of stampy, tophat or bwa; PE_reads must be 'false'.
if [ "$#" -ne 13 ]; then
	echo "Usage: ./genome_mapping_bias.sh query_fasta_file query_sequences_to_select_file num_query_sequences_to_select num_reads_to_select_per_query_sequence coverage_per_query_sequence read_length bowtie_index_basename target_sequence_feature_annotations_file output_directory method PE_reads sequencing_noise_rate insert_length" >&2
	# A usage error is a failure: a bare `exit` would return echo's status (0).
	exit 1
fi

query_fasta_file=$1
query_sequences_to_select_file=$2
num_query_sequences_to_select=$3
num_reads_to_select_per_query_sequence=$4
coverage_per_query_sequence=$5
read_length=$6
bowtie_index_basename=$7
target_sequence_feature_annotations_file=$8
output_directory=$9
method=${10}
PE_reads=${11}
# NOTE(review): the next two are not referenced again in this file; presumably
# they are consumed by the sourced helper functions - confirm.
sequencing_noise_rate=${12}
insert_length=${13}

# Scratch area: everything temporary lives under <output_directory>/temp.
temp_directory="$output_directory/temp"

# One sub-directory per aligner, plus one for the per-locus read counts.
temp_tophat_output_directory="$temp_directory/tophat"
temp_bwa_output_directory="$temp_directory/bwa"
temp_stampy_output_directory="$temp_directory/stampy"
temp_read_counts_per_locus_directory="$temp_directory/read_counts_per_locus"

# Reads files (per query sequence, and combined across query sequences).
temp_single_query_sequence_reads_file_mate_pair_1="$temp_directory/reads_file_pair_1"
# NOTE(review): this name lacks the _mate_pair_2 suffix although it points at
# reads_file_pair_2 - confirm against pipeline_helper_functions.sh before renaming.
temp_single_query_sequence_reads_file="$temp_directory/reads_file_pair_2"
temp_combined_reads_file_mate_pair_1="$temp_directory/combined_reads_file_mate_pair_1"
temp_combined_reads_file_mate_pair_2="$temp_directory/combined_reads_file_mate_pair_2"

# Alignment output and the list of selected query sequence names.
temp_sam_hits_file="$temp_directory/accepted_hits.sam"
temp_sequence_name_list_file="$temp_directory/sequence_name_list_file"

# Auxiliary files handed to the per-locus statistics script.
temp_pipeline_log_file="$temp_directory/pipeline_log_file"
temp_empty_SNP_file="$temp_directory/empty_SNP_file"


# Load the shared pipeline helpers. The functions called below
# (create_directories_in_directory_list and the run_*_on_reads_file_* aligner
# wrappers) are not defined in this file, so nothing useful can happen without
# them: abort instead of continuing with "command not found" errors.
if ! source pipeline_helper_functions.sh; then
	echo "Unable to source pipeline_helper_functions.sh" >&2
	exit 1
fi

# create required directories
# The helper is called without arguments; presumably it reads the global
# directory_list array - confirm in pipeline_helper_functions.sh.
directory_list=( "${output_directory}" "${temp_directory}" "${temp_tophat_output_directory}" "${temp_bwa_output_directory}" "${temp_stampy_output_directory}" "${temp_read_counts_per_locus_directory}" )
create_directories_in_directory_list

# create required files to support the pipeline run
# NOTE(review): creation of the pipeline log and empty-SNP files is disabled
# below, yet both paths are still passed to
# statistics_per_locus_for_multiple_separate_analyses.py later in this script -
# confirm that the script tolerates (or creates) missing files.
#rm -f ${temp_pipeline_log_file}
#touch ${temp_pipeline_log_file}

#rm -f ${temp_empty_SNP_file}
#touch ${temp_empty_SNP_file}

# Recreate both combined reads files as empty files.
rm -f -- "${temp_combined_reads_file_mate_pair_1}" "${temp_combined_reads_file_mate_pair_2}"
touch -- "${temp_combined_reads_file_mate_pair_1}" "${temp_combined_reads_file_mate_pair_2}"

# record path to Bowtie indices
# The index directory is resolved relative to the directory the script is run
# from: <cwd>/bowtie_indices/<bowtie_index_basename>.ebwt/
current_working_directory=$(pwd)
export BOWTIE_INDEXES="${current_working_directory}/bowtie_indices/${bowtie_index_basename}.ebwt/"

# Build the list of query sequence names in ${temp_sequence_name_list_file}:
#   - if num_query_sequences_to_select is not NA, pick that many names at
#     random from the query fasta file;
#   - otherwise, if query_sequences_to_select_file is not NA, copy that file;
#   - otherwise neither source was given, which is an error.
if [ "${num_query_sequences_to_select}" != NA ]; then
	# Only announce a random selection when one is actually performed.
	echo "In genome_mapping_bias.sh: Randomly selecting ${num_query_sequences_to_select} sequence names from fasta file ${query_fasta_file}"
	./randomly_print_n_sequence_names_from_fasta_file_to_output_file.py "${num_query_sequences_to_select}" "${query_fasta_file}" "${temp_sequence_name_list_file}"
elif [ "${query_sequences_to_select_file}" != NA ]; then
	if [ ! -e "${query_sequences_to_select_file}" ]; then
		echo "Unexpected: specified query_sequences_to_select_file '${query_sequences_to_select_file}' does not exist" >&2
		# Exit non-zero: a bare `exit` would return echo's status (0).
		exit 1
	fi
	echo "selecting sequences from file ${query_sequences_to_select_file}"
	cp -rp -- "${query_sequences_to_select_file}" "${temp_sequence_name_list_file}"
else
	echo "Unexpected: both parameters num_query_sequences_to_select and query_sequences_to_select_file are set to NA" >&2
	exit 1
fi

# Load the names into an array. The command substitution is deliberately left
# unquoted so that names separated by any whitespace become separate elements.
# shellcheck disable=SC2207
sequence_name_list=( $(cat "${temp_sequence_name_list_file}") )

# Only single-end reads are supported. This is written as a guard clause: the
# previous form had a `then` branch containing nothing but a comment, which
# bash rejects as a syntax error ("unexpected token `else'").
if [ "${PE_reads}" != false ]; then
	echo "Paired ends reads are not currently supported, so the variable PE_reads must be set to 'false'" >&2
	exit 1
fi

# Generate the reads for all selected query sequences into the combined
# mate-pair-1 reads file. Exactly one of num_reads_to_select_per_query_sequence
# and coverage_per_query_sequence must be NA; the other is passed as the first
# argument of the read generator.
# NOTE(review): both branches call the same "print_n_random_reads" script, so
# in the coverage branch a coverage value lands in the position used for a read
# count in the other branch - confirm that the script handles this as intended.
if [ "${num_reads_to_select_per_query_sequence}" != NA ] && [ "${coverage_per_query_sequence}" = NA ]; then
	./print_n_random_reads_of_length_x_from_subset_of_sequences_in_fasta_file_to_output_file.py "${num_reads_to_select_per_query_sequence}" "${read_length}" "${temp_sequence_name_list_file}" "${query_fasta_file}" "${temp_combined_reads_file_mate_pair_1}"
elif [ "${num_reads_to_select_per_query_sequence}" = NA ] && [ "${coverage_per_query_sequence}" != NA ]; then
	./print_n_random_reads_of_length_x_from_subset_of_sequences_in_fasta_file_to_output_file.py "${coverage_per_query_sequence}" "${read_length}" "${temp_sequence_name_list_file}" "${query_fasta_file}" "${temp_combined_reads_file_mate_pair_1}"
else
	echo "It should always be the case that exactly one of the variables 'num_reads_to_select_per_query_sequence' and 'coverage_per_query_sequence' should be NA" >&2
	# Exit non-zero: a bare `exit` would return echo's status (0).
	exit 1
fi

## parameters for the run_<method>_on_reads_file_to_generate_sam_hits_file helpers
# The helpers are invoked without arguments; presumably they read the globals
# set here (and bowtie_index_basename, already set from the command line) -
# confirm in pipeline_helper_functions.sh.
reads_file=${temp_combined_reads_file_mate_pair_1}
sam_hits_file=${temp_sam_hits_file}
tophat_output_directory=${temp_tophat_output_directory}
bwa_output_directory=${temp_bwa_output_directory}
stampy_output_directory=${temp_stampy_output_directory}
gene_model_annotations_GFF3_file=${target_sequence_feature_annotations_file}

# make sure we generate a new sam hits file
rm -rf -- "${sam_hits_file}" "${sam_hits_file}.gz"

# NOTE(review): hard-coded to the o_sativa index directory regardless of
# bowtie_index_basename - confirm this is intended.
genomic_reference_fasta_file=bowtie_indices/o_sativa.ebwt/o_sativa.fa

# generate sam hits file
case "${method}" in
	stampy)
		run_stampy_on_reads_file_to_generate_sam_hits_file
		;;
	tophat)
		run_tophat_on_reads_file_to_generate_sam_hits_file
		;;
	bwa)
		run_bwa_on_reads_file_to_generate_sam_hits_file
		;;
	*)
		echo "Bad method: ${method}" >&2
		# Exit non-zero: a bare `exit` would return echo's status (0).
		exit 1
		;;
esac


temp_sam_headers_file=${temp_directory}/temp_headers.sam
temp_sam_body_file=${temp_directory}/temp_body.sam

#######################################
# Rewrite a SAM file in place as: header lines (unchanged order) followed by
# the alignment records sorted with `sort`.
# Header lines are those that START with '@'. The match is anchored because
# '@' can also occur inside alignment records (e.g. in the quality string);
# an unanchored match would misfile those records as headers and drop them
# from the sorted body.
# Arguments:
#   $1 - SAM file to rewrite
#   $2 - file that receives the header lines
#   $3 - file that receives the unsorted alignment records
#######################################
sort_sam_body_preserving_headers() {
	local sam_file=$1
	local headers_file=$2
	local body_file=$3

	grep '^@' "${sam_file}" > "${headers_file}"
	grep -v '^@' "${sam_file}" > "${body_file}"

	cp -rp "${headers_file}" "${sam_file}"
	sort "${body_file}" >> "${sam_file}"
}

sort_sam_body_preserving_headers "${sam_hits_file}" "${temp_sam_headers_file}" "${temp_sam_body_file}"

# Split the sorted hits file; the per-sequence loop later in this script looks
# for the results as ${temp_directory}/accepted_hits_for_<sequence_name>.sam.
./split_sam_hits_file.py "${sam_hits_file}" "${temp_directory}"

# Two parallel listing files are built, one line per query sequence:
#   - the per-sequence SAM hits file
#   - the per-locus statistics output file to produce for it
# Both listings are then handed to the statistics script in a single call.
file_listing_sam_hits_files=${temp_directory}/file_listing_sam_hits_files
file_listing_statistics_per_locus_output_files=${temp_directory}/file_listing_statistics_per_locus_output_files

# Start both listings empty.
rm -rf -- "${file_listing_sam_hits_files}" "${file_listing_statistics_per_locus_output_files}"
touch -- "${file_listing_sam_hits_files}" "${file_listing_statistics_per_locus_output_files}"

# Quoted "${array[@]}" so each stored name is used as-is (no re-splitting or
# glob expansion of the names).
for sequence_name in "${sequence_name_list[@]}"; do
	echo "sequence_name: ${sequence_name}"
	temp_sam_hits_file_for_query_sequence=${temp_directory}/accepted_hits_for_${sequence_name}.sam
	statistics_per_locus_output_file=${temp_read_counts_per_locus_directory}/${sequence_name}_statistics_per_locus.txt
	echo "sam hits file: ${temp_sam_hits_file_for_query_sequence}"

	echo "${temp_sam_hits_file_for_query_sequence}" >> "${file_listing_sam_hits_files}"
	echo "${statistics_per_locus_output_file}" >> "${file_listing_statistics_per_locus_output_files}"

	# No per-sequence hits file exists for this sequence: substitute a copy of
	# the SAM headers file so that the listed path exists.
	if [ ! -e "${temp_sam_hits_file_for_query_sequence}" ]; then
		cp -rp "${temp_sam_headers_file}" "${temp_sam_hits_file_for_query_sequence}"
	fi
done
analysis_option=memory_efficient
# NOTE(review): this script never creates ${temp_empty_SNP_file} or
# ${temp_pipeline_log_file} (the creation code near the top is commented out) -
# confirm that the statistics script tolerates or creates them.
./statistics_per_locus_for_multiple_separate_analyses.py "${file_listing_sam_hits_files}" "${target_sequence_feature_annotations_file}" "${temp_empty_SNP_file}" "${analysis_option}" "${file_listing_statistics_per_locus_output_files}" "${temp_pipeline_log_file}"

# Write the arguments file for get.genome.mapping.bias.R, one value per line:
#   line 1:  directory holding the per-locus read count files
#   line 2:  output directory
#   line 3+: one query sequence name per line
# R CMD BATCH is invoked without extra arguments; presumably the R script reads
# this fixed-name file from the current directory - confirm in the R script.
arguments_file=get.genome.mapping.bias.arguments.file
rm -f -- "${arguments_file}"
# printf with quoted expansions writes each value verbatim on its own line
# (no word splitting, globbing or echo option parsing).
printf '%s\n' "${temp_read_counts_per_locus_directory}" "${output_directory}" "${sequence_name_list[@]}" > "${arguments_file}"
R CMD BATCH get.genome.mapping.bias.R
