#!/usr/bin/env bash
#
# Entry script that predicts SNPs between two parents from reads data.
# It expects exactly 12 positional arguments (listed in the usage message
# below) and finishes by calling generate_SNP_file_from_reads_data.
#
# Bash is required: the script uses arrays, pushd/popd, 'source' and the
# '==' test operator, none of which are guaranteed by a plain POSIX /bin/sh.

if [ "$#" -ne 12 ]; then
	# Wrong invocation is an error: report on stderr and exit non-zero so
	# that calling scripts can detect it.
	echo "Usage: ./generate_SNP_file_from_reads_data.sh parent_A parent_B reference_genotype top_level_reads_directory read_alignment_method bowtie_index_basename genomic_reference_fasta_file_for_bwa gene_model_annotations_GFF3_file SNP_filename results_directory cutoff_for_filtering_SNPs_that_are_just_outside_exon_boundaries negative_call_coverage_threshold" >&2
	exit 1
fi

## Command-line arguments, in the order given in the usage message.

# The two parents to compare, and the genotype of the reference.
parent_A="${1}"
parent_B="${2}"
reference_genotype="${3}"

# Directory that holds one sub-directory of reads per parent.
top_level_reads_directory="${4}"

# Alignment method and the reference inputs for each supported aligner.
read_alignment_method="${5}"
bowtie_index_basename="${6}"
genomic_reference_fasta_file_for_bwa="${7}"

# Gene annotations, output locations and numeric thresholds.
gene_model_annotations_GFF3_file="${8}"
final_SNP_file="${9}"
results_directory="${10}"
cutoff_for_filtering_SNPs_that_are_just_outside_exon_boundaries="${11}"
negative_call_coverage_threshold="${12}"


## Directories
##
## Alignment and SNP outputs each get a sub-directory of the results
## directory; intermediate files go to 'temp', relative to the current
## working directory.

temp_directory="temp"
SNP_output_directory="${results_directory}/SNP"
bwa_output_directory="${results_directory}/bwa"
tophat_output_directory="${results_directory}/tophat"

## Load helper functions. Several names used later in this file are not
## defined here; presumably they come from this helper file -- TODO confirm.
source pipeline_helper_functions.sh


## Set value of variable 'ref_file' (the reference fasta file that is later
## passed to samtools).
## TOPHAT and BWA are not defined in this file; presumably they are set by
## pipeline_helper_functions.sh -- TODO confirm.
if [ "${read_alignment_method}" == "${TOPHAT}" ]; then
	## The Bowtie index directory is the same one that is exported as
	## BOWTIE_INDEXES before TopHat is run further down in this file.
	bowtie_index_directory="bowtie_indices/${bowtie_index_basename}.ebwt"
	ref_file="${bowtie_index_directory}/${bowtie_index_basename}.fa"
	## if reference file does not exist, then generate it
	if [ ! -e "${ref_file}" ]; then
		## Run bowtie-inspect inside the index directory, in a subshell so
		## that the working directory of this script never changes, and so
		## that a failed 'cd' cannot make bowtie-inspect run elsewhere.
		if ! ( cd "${bowtie_index_directory}" && bowtie-inspect "${bowtie_index_basename}" > "${bowtie_index_basename}.fa" ); then
			echo "Error: Could not generate reference fasta file ${ref_file} with bowtie-inspect" >&2
			## Do not leave an empty/partial file behind: a later run
			## would mistake it for a valid reference.
			rm -f "${ref_file}"
			exit 1
		fi
	fi
elif [ "${read_alignment_method}" == "${BWA}" ]; then
	## since we are using BWA, the variable 'genomic_reference_fasta_file_for_bwa'
	## should be the name of a fasta file for the genomic reference
	ref_file="${genomic_reference_fasta_file_for_bwa}"
	if [ ! -e "${ref_file}" ]; then
		echo "Internal Error: Could not find genomic reference fasta file ${genomic_reference_fasta_file_for_bwa}" >&2
		exit 1
	fi
else
	echo "Internal Error: Missing case for read_alignment_method '${read_alignment_method}'" >&2
	exit 1
fi

#######################################
# Top-level driver: computes SNP calls for both parents and writes the final
# SNP file.
# Globals read:
#   parent_A, parent_B, reference_genotype, top_level_reads_directory,
#   tophat_output_directory, bwa_output_directory, SNP_output_directory,
#   temp_directory, gene_model_annotations_GFF3_file, final_SNP_file,
#   cutoff_for_filtering_SNPs_that_are_just_outside_exon_boundaries
# Globals written (consumed by the functions called from here):
#   directory_list, reads_directory, condition, and the *_SNP_file_* names
# Exits with status 1 if both parents are identical or if one of the
# SNP-combining / filtering / copy steps fails.
#######################################
generate_SNP_file_from_reads_data () {
	# Operands are quoted so that empty values or values containing spaces
	# cannot break the comparison.
	if [ "$parent_A" == "$parent_B" ]; then
		echo "parent_A and parent_B must be different from each other" >&2
		exit 1
	fi
	directory_list=( "$tophat_output_directory" "${bwa_output_directory}" "$SNP_output_directory" "$temp_directory" )
	create_directories_in_directory_list

	# This helper is not defined in this file; the code below relies on it
	# setting 'reference_parent' and 'other_parent'.
	assign_parent_A_and_parent_B_to_reference_parent_and_other_parent_taking_into_account_reference_genotype

	reads_directory_reference_parent="${top_level_reads_directory}/${reference_parent}"
	reads_directory_other_parent="${top_level_reads_directory}/${other_parent}"

	# SNP calls for the reference parent. The callee communicates through the
	# globals 'reads_directory' / 'condition' (inputs) and
	# 'SNP_file_with_positive_calls' / 'SNP_file_with_negative_calls' (outputs).
	reads_directory="${reads_directory_reference_parent}"
	condition="${reference_parent}"
	get_positive_and_negative_SNP_calls_for_condition
	reference_parent_SNP_file_with_positive_calls="${SNP_file_with_positive_calls}"
	reference_parent_SNP_file_with_negative_calls="${SNP_file_with_negative_calls}"

	# SNP calls for the other parent.
	reads_directory="${reads_directory_other_parent}"
	condition="${other_parent}"
	get_positive_and_negative_SNP_calls_for_condition
	other_parent_SNP_file_with_positive_calls="${SNP_file_with_positive_calls}"
	other_parent_SNP_file_with_negative_calls="${SNP_file_with_negative_calls}"

	final_SNP_file_including_exon_boundary_SNPs="${SNP_output_directory}/predicted_SNPs_between_${reference_parent}_and_${other_parent}__including_SNPs_that_are_just_outside_exon_boundaries.snp"
	final_SNP_file_excluding_exon_boundary_SNPs="${SNP_output_directory}/predicted_SNPs_between_${reference_parent}_and_${other_parent}__excluding_SNPs_that_are_just_outside_exon_boundaries.snp"

	# If one parent has the reference genotype, the positive calls of the two
	# parents are differenced; otherwise the files with negative calls of
	# both parents are combined.
	if [ "$parent_A" == "$reference_genotype" ] || [ "$parent_B" == "$reference_genotype" ]; then
		if ! ./get_difference_between_SNP_files.py "${other_parent_SNP_file_with_positive_calls}" "${reference_parent_SNP_file_with_positive_calls}" s > "$final_SNP_file_including_exon_boundary_SNPs"; then
			echo "Error: get_difference_between_SNP_files.py failed" >&2
			exit 1
		fi
	else
		if ! ./get_SNPs_between_the_two_other_alleles_from_two_SNP_files.py "${reference_parent_SNP_file_with_negative_calls}" "${other_parent_SNP_file_with_negative_calls}" "$final_SNP_file_including_exon_boundary_SNPs"; then
			echo "Error: get_SNPs_between_the_two_other_alleles_from_two_SNP_files.py failed" >&2
			exit 1
		fi
	fi

	if ! ./filter_out_SNPs_that_are_just_outside_exon_boundaries.py "${final_SNP_file_including_exon_boundary_SNPs}" "${gene_model_annotations_GFF3_file}" "${cutoff_for_filtering_SNPs_that_are_just_outside_exon_boundaries}" "${final_SNP_file_excluding_exon_boundary_SNPs}"; then
		echo "Error: filter_out_SNPs_that_are_just_outside_exon_boundaries.py failed" >&2
		exit 1
	fi

	# Publish the filtered result under the file name requested by the caller.
	if ! cp -p "${final_SNP_file_excluding_exon_boundary_SNPs}" "${final_SNP_file}"; then
		echo "Error: could not copy final SNP file to ${final_SNP_file}" >&2
		exit 1
	fi
}

#######################################
# Produces the SNP file with positive calls and the SNP file with negative
# calls for one condition.
# Globals read:
#   condition, read_alignment_method, TOPHAT, BWA, SNP_output_directory,
#   tophat_output_directory, bwa_output_directory, ref_file,
#   negative_call_coverage_threshold
# Globals written:
#   output_file_basename, hits_file_basename, bam_hits_file,
#   sorted_bam_file_base, sorted_bam_file, pileup_file,
#   SNP_file_with_positive_calls, SNP_file_with_negative_calls
# Exits with status 1 on an unsupported alignment method, when both a plain
# and a gzipped pileup file exist, or when the pileup cannot be generated.
#######################################
get_positive_and_negative_SNP_calls_for_condition () {
	output_file_basename="${SNP_output_directory}/predicted_SNPs_for_${condition}"

	## set values of variables 'hits_file_basename' and 'bam_hits_file'
	if [ "${read_alignment_method}" == "${TOPHAT}" ]; then
		hits_file_basename="${tophat_output_directory}/${condition}_accepted_hits"
	elif [ "${read_alignment_method}" == "${BWA}" ]; then
		hits_file_basename="${bwa_output_directory}/${condition}_accepted_hits"
	else
		echo "Internal Error: Missing case for read_alignment_method '${read_alignment_method}'" >&2
		exit 1
	fi
	bam_hits_file="${hits_file_basename}.bam"

	## positive calls are only computed when they do not exist yet
	SNP_file_with_positive_calls="${output_file_basename}.snp"
	if [ ! -e "${SNP_file_with_positive_calls}" ]; then
		get_positive_SNP_calls_for_condition
	fi

	## get sorted bam file
	## NOTE(review): this is the positional 'samtools sort <in.bam> <out.prefix>'
	## form; confirm that the installed samtools version still accepts it.
	echo DEBUG: Computing sorted bam file
	sorted_bam_file_base="${hits_file_basename}_sorted"
	samtools sort "${bam_hits_file}" "${sorted_bam_file_base}"
	sorted_bam_file="${sorted_bam_file_base}.bam"

	## reuse a gzipped pileup file from an earlier run if there is one
	pileup_file="${output_file_basename}.pileup"
	if [ -e "${pileup_file}.gz" ]; then
		if [ -e "${pileup_file}" ]; then
			echo "Internal Error: found both a pileup file and a gzipped pileup file of the same name," >&2
			echo "in function 'get_positive_and_negative_SNP_calls_for_condition' of 'generate_SNP_file_from_reads_data.sh'." >&2
			exit 1
		fi
		gunzip "${pileup_file}.gz"
	fi
	if [ ! -e "${pileup_file}" ]; then
		## generate pileup file
		samtools faidx "${ref_file}"
		if ! samtools mpileup -f "${ref_file}" "${sorted_bam_file}" > "${pileup_file}"; then
			echo "Error: samtools mpileup failed for condition ${condition}" >&2
			## a partial pileup file would be silently reused by later runs
			rm -f "${pileup_file}"
			exit 1
		fi
	fi

	SNP_file_with_negative_calls="${output_file_basename}_with_negative_calls.snp"
	./get_SNP_file_with_negative_calls.py "${SNP_file_with_positive_calls}" "${pileup_file}" "$negative_call_coverage_threshold" "$SNP_file_with_negative_calls"

	## the pileup file is kept only in compressed form between runs
	gzip "${pileup_file}"
}

#######################################
# Combines all reads files of one condition, aligns them with the selected
# aligner and converts the predicted variants into a SNP file.
# Globals read:
#   reads_directory, condition, temp_directory, read_alignment_method,
#   TOPHAT, BWA, bowtie_index_basename, genomic_reference_fasta_file_for_bwa,
#   ref_file, bam_hits_file, output_file_basename,
#   SNP_file_with_positive_calls
# Globals written:
#   bio_rep_names_list, bio_rep_name, combined_reads_file, reads_file,
#   current_working_directory, BOWTIE_INDEXES (exported),
#   genomic_reference_fasta_file, vcf_file
# Exits with status 1 if the reads directory is missing or contains no
# regular files, if the combined reads file cannot be created, or if the
# alignment method is unsupported.
#######################################
get_positive_SNP_calls_for_condition () {
	local reads_path

	if [ ! -d "${reads_directory}" ]; then
		echo "Error: reads directory '${reads_directory}' not found for condition $condition" >&2
		exit 1
	fi

	# Collect the names of the reads files with a glob instead of parsing
	# 'ls': this is safe for names containing spaces and never lists the
	# wrong directory. An unmatched glob stays literal and fails the -f test.
	bio_rep_names_list=()
	for reads_path in "${reads_directory}"/*; do
		[ -f "${reads_path}" ] || continue
		bio_rep_names_list+=( "${reads_path##*/}" )
	done
	echo "DEBUG: bio_rep_names_list=${bio_rep_names_list[*]}"

	if [ "${#bio_rep_names_list[@]}" -eq 0 ]; then
		echo "Error: found no reads files for condition $condition" >&2
		exit 1
	fi

	# Start from an empty file: a leftover combined reads file from an
	# interrupted run must not be appended to, or its reads would be
	# included twice.
	combined_reads_file="${temp_directory}/_combined_reads_file_${condition}"
	if ! : > "${combined_reads_file}"; then
		echo "Error: could not create combined reads file ${combined_reads_file}" >&2
		exit 1
	fi
	for bio_rep_name in "${bio_rep_names_list[@]}"; do
		cat "${reads_directory}/${bio_rep_name}" >> "${combined_reads_file}"
	done

	## Align reads
	reads_file="${combined_reads_file}"
	if [ "${read_alignment_method}" == "${TOPHAT}" ]; then
		# note: TopHat depends on Bowtie and both executables should be in 'PATH'
		current_working_directory=$(pwd)
		export BOWTIE_INDEXES="${current_working_directory}/bowtie_indices/${bowtie_index_basename}.ebwt/"
		run_tophat_on_reads_file_to_generate_bam_hits_file
	elif [ "${read_alignment_method}" == "${BWA}" ]; then
		## bwa executable should be in 'PATH'
		genomic_reference_fasta_file="${genomic_reference_fasta_file_for_bwa}"
		run_bwa_on_reads_file_to_generate_bam_hits_file
	else
		echo "Internal Error: Missing case for read_alignment_method '${read_alignment_method}'" >&2
		exit 1
	fi
	# Keep the combined reads only in compressed form; -f replaces a stale
	# archive from an earlier run instead of refusing or prompting.
	gzip -f "${reads_file}"

	## predict SNPs using samtools
	## samtools2SNP.sh is presumably what writes ${vcf_file} -- TODO confirm
	vcf_file="${output_file_basename}.flt.vcf"
	./samtools2SNP.sh "${ref_file}" "${bam_hits_file}" "${output_file_basename}"

	./vcf2SNP.py "${vcf_file}" "${SNP_file_with_positive_calls}"
}

## Run the pipeline. This call must stay at the end of the file so that all
## functions defined above exist by the time it executes.
generate_SNP_file_from_reads_data
