#!/bin/bash
# mRNA-seq pipeline
# by Victor Missirian
#
# Requires bash: the script relies on arrays and other bash-only features,
# so it cannot run under a strictly POSIX /bin/sh (e.g. dash).


###############################################################################
## Important Notes for User:
###############################################################################

# 1. Examples for <bowtie_index_basename>: "a_thaliana", "o_sativa"


###############################################################################
## Pipeline Notes:
###############################################################################

# 1. does not currently search for new transcripts or exon junctions
# 2. assumes Phred-scaled base-64 quality scores (also mentioned in the README)


###############################################################################
## Notes to self:
###############################################################################

# 1. Still need to implement the option of returning a representative sample of the differentially-expressed loci
# 2. Is it possible to compute the DEF of each locus in the case where we have biological replicates?
# 3. For GFF3 gene annotation file:
# Most importantly, the values in the first column, which indicates the chromosome or contig on which the
# feature is located, must match a reference sequence record in the Bowtie index you are using
# with TopHat. You can get a list of the records in a Bowtie index by typing:
#
#    bowtie-inspect --names your_index 
#
#
# You must create a directory named "bowtie_indices" inside the directory from which the pipeline
# script is run, and place the needed Bowtie indices in that directory.

# Require exactly nine positional arguments. On misuse, print the usage line
# to stderr and exit with a non-zero status so that callers can detect it.
if [ "$#" -ne 9 ]; then
	echo "Usage: ./mRNA-seq_pipeline.sh condition_1 condition_2 top_level_reads_directory read_alignment_method bowtie_index_basename genomic_reference_fasta_file_for_bwa gene_model_annotations_GFF3_file SNP_filename reads_processing_output_directory" >&2
	exit 1
fi

# Bind the nine positional arguments to named globals, in usage order.

# Names of the two conditions being compared.
condition_1="$1"
condition_2="$2"

# Directory holding one sub-directory of reads per condition.
top_level_reads_directory="$3"

# Aligner selection and the aligner-specific inputs.
read_alignment_method="$4"
bowtie_index_basename="$5"
genomic_reference_fasta_file_for_bwa="$6"

# Annotation inputs handed to the per-locus statistics step.
gene_model_annotations_GFF3_file="$7"
SNP_filename="$8"

# Root directory for everything the pipeline produces.
reads_processing_output_directory="$9"



## Directories

# One output directory per pipeline stage, all rooted at the reads-processing
# output directory.
bio_rep_names_directory="${reads_processing_output_directory}/biological_replicate_names"
tophat_output_directory="${reads_processing_output_directory}/tophat"
bwa_output_directory="${reads_processing_output_directory}/bwa"
read_counts_per_locus_directory="${reads_processing_output_directory}/read_counts_per_locus"
differential_expression_directory="${reads_processing_output_directory}/differential_expression"

# Scratch directory; a relative path, so it resolves against the directory the
# pipeline is run from.
temp_directory="temp"

# Each condition's reads are looked up in a sub-directory of the top-level
# reads directory that carries the condition's name.
reads_directory_c1="${top_level_reads_directory}/${condition_1}"
reads_directory_c2="${top_level_reads_directory}/${condition_2}"


## Temporary Files

# All scratch files live in the temp directory; those that describe a
# comparison carry both condition names in their file name.

pipeline_log_file="${temp_directory}/_temp_pipeline_log"

conditions_file="${temp_directory}/_temp_conditions_file_${condition_1}_vs_${condition_2}"

read_counts_per_allele_file="${temp_directory}/_temp_read_counts_per_allele_file_${condition_1}_vs_${condition_2}"

# Per-condition read counts, followed by the combined file for the comparison.
read_counts_per_locus_file_1="${temp_directory}/_temp_read_counts_file_${condition_1}"
read_counts_per_locus_file_2="${temp_directory}/_temp_read_counts_file_${condition_2}"
read_counts_per_locus_file="${temp_directory}/_temp_read_counts_file_${condition_1}_vs_${condition_2}"

# output file for Fisher's exact test script
DE_loci_summary_file="${temp_directory}/_temp_DE_loci_summary_${condition_1}_vs_${condition_2}"


# Load the helpers this script calls but does not define itself (for example
# create_directories_in_directory_list and the run_*_on_reads_file_to_generate_sam_hits_file
# aligner wrappers; presumably also the TOPHAT and BWA method names -- verify
# against the helper file). The path is relative, so the pipeline must be
# started from the directory that contains the helper file. Abort if it cannot
# be loaded rather than continuing with undefined functions.
if ! source pipeline_helper_functions.sh; then
	echo "Error: could not load pipeline_helper_functions.sh (looked in $(pwd))" >&2
	exit 1
fi

# Top-level driver: processes the reads of both conditions, builds the read
# count files for all replicates, and tests for differential expression.
#
# Globals read:    condition_1, condition_2, reads_directory_c1,
#                  reads_directory_c2, the *_directory variables, and the
#                  temporary file paths defined at the top of the script.
# Globals written: directory_list, condition, reads_directory,
#                  c1_bio_rep_list, c2_bio_rep_list, num_bio_reps_c1,
#                  num_bio_reps_c2, statistics_per_locus_files_list,
#                  final_results_file, and the FET-specific variables below.
# Exits with status 1 when a condition has no biological replicate, when a
# condition has more than one (not supported), or when the number of
# statistics files does not match the number of replicates.
run_mRNA_seq_pipeline () {
	## Create required directories

	# directory_list is left global for create_directories_in_directory_list
	# (defined outside this file; presumably it reads this array).
	directory_list=(
		"$differential_expression_directory"
		"$reads_processing_output_directory"
		"$bio_rep_names_directory"
		"$tophat_output_directory"
		"$bwa_output_directory"
		"$read_counts_per_locus_directory"
		"$temp_directory"
	)
	create_directories_in_directory_list


	## process reads for each condition

	# process_reads_for_condition takes its inputs from the globals
	# 'condition' and 'reads_directory'.
	condition=${condition_1}
	reads_directory=${reads_directory_c1}
	process_reads_for_condition

	condition=${condition_2}
	reads_directory=${reads_directory_c2}
	process_reads_for_condition


	## Determine number of biological replicates for each condition

	# The names files hold whitespace-separated replicate names, so the
	# command substitutions are deliberately left unquoted to split them.
	c1_bio_rep_list=( $(cat "${bio_rep_names_directory}/biological_replicate_names_for_${condition_1}") )
	c2_bio_rep_list=( $(cat "${bio_rep_names_directory}/biological_replicate_names_for_${condition_2}") )
	num_bio_reps_c1=${#c1_bio_rep_list[@]}
	num_bio_reps_c2=${#c2_bio_rep_list[@]}


	## Get information about which statistics per locus files correspond
	## to which conditions

	get_conditions_string_and_statistics_per_locus_files_string

	# Split the space-separated string once; the array is reused below.
	statistics_per_locus_files_list=( ${statistics_per_locus_files_string} )


	## Get statistics for each locus on each biological replicate

	echo ./create_read_counts_file_for_all_replicates.py read_counts "${read_counts_per_locus_file}" "${statistics_per_locus_files_list[@]}"
	./create_read_counts_file_for_all_replicates.py read_counts "${read_counts_per_locus_file}" "${statistics_per_locus_files_list[@]}"
	./create_read_counts_file_for_all_replicates.py read_counts_per_allele "${read_counts_per_allele_file}" "${statistics_per_locus_files_list[@]}"

	## Identify Differentially-expressed Loci
	final_results_file=${differential_expression_directory}/results_${condition_1}_vs_${condition_2}
	if [[ $num_bio_reps_c1 -lt 1 || $num_bio_reps_c2 -lt 1 ]]; then
		echo "Error: Expected that there would be at least one biological replicate for each condition" >&2
		exit 1
	elif [[ $num_bio_reps_c1 -eq 1 && $num_bio_reps_c2 -eq 1 ]]; then
		## Run Fisher's Exact Test

		echo "There is exactly one biological replicate for each condition, so we are identifying differentially expressed loci using Fisher's exact test"
		differential_expression_method_used_to_compare_conditions=FET

		num_statistics_per_locus_files=${#statistics_per_locus_files_list[@]}
		if [[ $num_statistics_per_locus_files -ne 2 ]]; then
			echo "Internal Error: Expected statistics_per_locus_files_list to have 2 elements, but found $num_statistics_per_locus_files elements" >&2
			# Stop here: the two per-condition files below would be wrong.
			exit 1
		fi
		c1_statistics_per_locus_file=${statistics_per_locus_files_list[0]}
		c2_statistics_per_locus_file=${statistics_per_locus_files_list[1]}
		./create_read_counts_file_for_all_replicates.py read_counts "${read_counts_per_locus_file_1}" "$c1_statistics_per_locus_file"
		./create_read_counts_file_for_all_replicates.py read_counts "${read_counts_per_locus_file_2}" "$c2_statistics_per_locus_file"
		detect_DE_loci_using_Fishers_Exact_Test
		./create_final_output_file.py "${read_counts_per_locus_file}" "${read_counts_per_allele_file}" "${DE_loci_summary_file}" "${final_results_file}"
	else
		echo "Error: POPE currently does not handle multiple biological replicates per condition" >&2
		exit 1
	fi

	# Record which method produced the results for this pair of conditions.
	differential_expression_method_used_to_compare_conditions_file=${differential_expression_directory}/differential_expression_method_used_to_compare_${condition_1}_and_${condition_2}
	echo "$differential_expression_method_used_to_compare_conditions" > "$differential_expression_method_used_to_compare_conditions_file"
}

# Make sure a "<replicate>_statistics_per_locus" file exists for every
# biological replicate of one condition, aligning reads where needed, and
# record the replicate names for that condition.
#
# Globals read:    condition, reads_directory, bio_rep_names_directory,
#                  read_counts_per_locus_directory, read_alignment_method,
#                  TOPHAT, BWA, bowtie_index_basename, tophat_output_directory,
#                  bwa_output_directory, genomic_reference_fasta_file_for_bwa,
#                  gene_model_annotations_GFF3_file, SNP_filename,
#                  pipeline_log_file.
# Globals written: bio_rep_names_list, bool_processed_all_bio_reps,
#                  sam_hits_file, reads_file, genomic_reference_fasta_file and
#                  the other working variables assigned below (left global
#                  because the aligner helpers are defined outside this file
#                  and take their inputs from globals).
# Exits with status 1 if the reads directory is missing or empty, or if
# read_alignment_method matches neither TOPHAT nor BWA.
process_reads_for_condition () {
	local reads_path

	## Check if we have already generated a statistics per locus file
	## for each biological replicate
	bio_rep_names_for_condition_file=${bio_rep_names_directory}/biological_replicate_names_for_${condition}

	# Start from a clean slate so that names left over from a previously
	# processed condition are never reported or reused.
	bio_rep_names_list=()
	bool_processed_all_bio_reps=

	if [[ -e "$bio_rep_names_for_condition_file" ]]; then
		echo Found biological replicate names for condition file
		# The file holds whitespace-separated names; splitting is intended.
		bio_rep_names_list=( $(cat "$bio_rep_names_for_condition_file") )

		bool_processed_all_bio_reps=1
		for bio_rep in "${bio_rep_names_list[@]}"; do
			if [[ ! -e "${read_counts_per_locus_directory}/${bio_rep}_statistics_per_locus" ]]; then
				## did not finish processing this biological replicate
				bool_processed_all_bio_reps=
			fi
		done
	fi
	# (without a names file the replicate names cannot be determined, so the
	# flag stays empty)

	echo "DEBUG: bio reps=${bio_rep_names_list[*]}"
	echo "DEBUG: bool_processed_all_bio_reps=${bool_processed_all_bio_reps}"

	## Stop if we have already generated a statistics per locus file
	## for each biological replicate
	if [[ -n "$bool_processed_all_bio_reps" ]]; then
		echo "Found a statistics per locus file for each biological replicate of condition $condition"
		return 0
	fi

	# Refuse to continue without the reads directory; otherwise the files of
	# whatever directory we happen to be in would be mistaken for replicates.
	if [[ ! -d "$reads_directory" ]]; then
		echo "Error: reads directory '$reads_directory' for condition $condition does not exist" >&2
		exit 1
	fi

	# Each entry of the reads directory is one biological replicate. Use a
	# glob instead of parsing 'ls' output.
	bio_rep_names_list=()
	for reads_path in "$reads_directory"/*; do
		# An unmatched glob stays literal; skip it so that an empty
		# directory yields an empty list.
		[[ -e "$reads_path" ]] || continue
		bio_rep_names_list+=( "${reads_path##*/}" )
	done
	echo "DEBUG: bio_rep_names_list=${bio_rep_names_list[*]}"

	if [[ ${#bio_rep_names_list[@]} -eq 0 ]]; then
		echo "Error: found no reads files for condition $condition" >&2
		exit 1
	fi

	bio_rep_names_for_condition_string=
	for bio_rep_name in "${bio_rep_names_list[@]}"; do
		## Align reads

		# note: TopHat depends on Bowtie and both executables should be in 'PATH'

		# Should I include a parameter for the options --solexa1.3-quals/--phred64-quals?

		# most promising options I may decide to include: -a -m -i -I -F -g
		# --closure-search --coverage-search --microexon-search --butterfly-search
		# --segment-mismatches (plus the next few consecutive options in manual)
		# --no-novel-juncs
		statistics_per_locus_output_file=${read_counts_per_locus_directory}/${bio_rep_name}_statistics_per_locus

		## Generate a statistics per locus file for the given biological replicate, if one does not exist
		if [[ ! -e "$statistics_per_locus_output_file" ]]; then

			if [[ "$read_alignment_method" == "$TOPHAT" ]]; then
				current_working_directory=$(pwd)
				export BOWTIE_INDEXES=${current_working_directory}/bowtie_indices/${bowtie_index_basename}.ebwt/

				sam_hits_file=${tophat_output_directory}/${bio_rep_name}_accepted_hits.sam
				reads_file=${reads_directory}/${bio_rep_name}
				run_tophat_on_reads_file_to_generate_sam_hits_file
			elif [[ "$read_alignment_method" == "$BWA" ]]; then
				genomic_reference_fasta_file=${genomic_reference_fasta_file_for_bwa}
				sam_hits_file=${bwa_output_directory}/${bio_rep_name}_accepted_hits.sam
				reads_file=${reads_directory}/${bio_rep_name}
				run_bwa_on_reads_file_to_generate_sam_hits_file
			else
				echo "Internal Error: Missing case for read_alignment_method '${read_alignment_method}'" >&2
				exit 1
			fi

			./statistics_per_locus.py "$sam_hits_file" "$gene_model_annotations_GFF3_file" "$SNP_filename" "$statistics_per_locus_output_file" "$pipeline_log_file"

			## gzip sam hits file and the "filtered alignments" files that are generated by statistics_per_locus.py
			gzip "${sam_hits_file}"
			gzip "${sam_hits_file}-filtered_alignments"
			gzip "${sam_hits_file}-filtered_alignments_that_map_to_a_specific_locus"
			gzip "${sam_hits_file}-filtered_alignments_that_are_assigned_to_a_specific_allele"
		fi

		# Append the name, separated by a single space from earlier names.
		bio_rep_names_for_condition_string="${bio_rep_names_for_condition_string:+${bio_rep_names_for_condition_string} }${bio_rep_name}"
	done


	## Each reads file corresponds to one biological replicate,
	## so we use the names of the reads files as the names of the biological replicates
	## (the names are stored space-separated, so file names must not contain whitespace)

	echo "${bio_rep_names_for_condition_string}" > "$bio_rep_names_for_condition_file"
}

# Build two parallel, space-separated strings with one entry per biological
# replicate (all replicates of condition 1 first, then those of condition 2):
#   conditions_string                  - the condition name of each replicate
#   statistics_per_locus_files_string  - the replicate's statistics file path
#
# Globals read:    c1_bio_rep_list, c2_bio_rep_list, condition_1, condition_2,
#                  read_counts_per_locus_directory.
# Globals written: conditions_string, statistics_per_locus_files_string.
get_conditions_string_and_statistics_per_locus_files_string () {
	local bio_rep
	local -a condition_names=() statistics_files=()
	# "${array[*]}" joins on the first character of IFS; pin it to a space.
	local IFS=' '

	for bio_rep in "${c1_bio_rep_list[@]}"; do
		condition_names+=( "$condition_1" )
		statistics_files+=( "${read_counts_per_locus_directory}/${bio_rep}_statistics_per_locus" )
	done

	for bio_rep in "${c2_bio_rep_list[@]}"; do
		condition_names+=( "$condition_2" )
		statistics_files+=( "${read_counts_per_locus_directory}/${bio_rep}_statistics_per_locus" )
	done

	# Join in the shell itself: no subshell per element, and values are never
	# passed through echo, so option-like names such as "-n" survive intact.
	conditions_string="${condition_names[*]}"
	statistics_per_locus_files_string="${statistics_files[*]}"
}

# Write the arguments file for the Fisher's exact test R script and run the
# script in batch mode.
#
# Globals read:    read_counts_per_locus_file_1, read_counts_per_locus_file_2,
#                  DE_loci_summary_file.
# Globals written: pvalue_thresh, R_arguments_file.
# Returns:         the exit status of the R batch run.
detect_DE_loci_using_Fishers_Exact_Test () {
	## Test for differential expression

	# note: the test takes into account upper-quartile normalization

	# return differential expression statistics for all loci
	pvalue_thresh=1.1

	# One argument per line, in this order: read counts of condition 1, read
	# counts of condition 2, p-value threshold, output summary file.
	# (Presumably DE.fishers.exact.test.R reads this file from the current
	# directory -- confirm against the R script.)
	R_arguments_file=DE.fishers.exact.test.arguments.file
	rm -f -- "$R_arguments_file"
	printf '%s\n' \
		"$read_counts_per_locus_file_1" \
		"$read_counts_per_locus_file_2" \
		"$pvalue_thresh" \
		"$DE_loci_summary_file" > "$R_arguments_file"

	R CMD BATCH --vanilla DE.fishers.exact.test.R
}

# Entry point: run the pipeline now that the functions above have been defined.
run_mRNA_seq_pipeline
