#!/usr/bin/env bash
# POPE: Pipeline of Parentally-biased Expression
# by Victor Missirian
#
# This script uses bash-only features (arrays, 'source', '==' inside '[ ]'),
# so it is run with bash rather than with a plain POSIX /bin/sh.


## Notes on things to add to README:
# 1. record all changes to DE pipeline module
# 2. read file names must be unique across all conditions
# 3. condition names must not contain any instance of an underscore ('_')
# 4. reads directory for a given parent must be in 'top_level_reads_directory'
#    and its name must match the name for the parent provided to the pipeline
# 5. reads directory for a reciprocal hybrid of two parents must have a name of the form
#    real_hybrid_<parent_1>_<parent_2>
###### MORE RECENT ##########
# 6. reference/other allele issue - describe how I determine which parent is listed as the reference
# 7. provided SNP file, versus 'NA'

## Argument check: exactly twelve positional arguments are required.
## The usage message goes to stderr and the script exits with a non-zero
## status so that a calling script can detect the bad invocation.
if [ $# -ne 12 ]; then
	echo "Usage: ./reciprocal_hybrid_pipeline.sh ref_parent other_parent top_level_reads_directory read_alignment_method bowtie_index_basename genomic_reference_fasta_file_for_bwa gene_model_annotations_GFF3_file SNP_filename cutoff_for_filtering_SNPs_that_are_just_outside_exon_boundaries negative_SNP_call_coverage_threshold pvalue_threshold reads_processing_output_directory" >&2
	exit 1
fi

## Positional arguments, in the order given in the usage message.
## (Arguments 10 and above need braces: $10 would be parsed as ${1}0.)
ref_parent=$1
other_parent=$2
top_level_reads_directory=$3
read_alignment_method=$4
bowtie_index_basename=$5
genomic_reference_fasta_file_for_bwa=$6
gene_model_annotations_GFF3_file=$7
SNP_filename=$8
cutoff_for_filtering_SNPs_that_are_just_outside_exon_boundaries=${9}
negative_SNP_call_coverage_threshold=${10}
# NOTE(review): pvalue_threshold is stored here but is not referenced again in this file.
pvalue_threshold=${11}
reads_processing_output_directory=${12}


## Directories (all located under the reads-processing output directory)

SNP_output_directory=${reads_processing_output_directory}/SNP
read_counts_per_locus_directory=${reads_processing_output_directory}/read_counts_per_locus
differential_expression_directory=${reads_processing_output_directory}/differential_expression

## Helper functions called by the pipeline below but not defined in this file
## (presumably defined in the sourced file - confirm):
##   check_if_specified_read_alignment_method_is_valid
##   create_directories_in_directory_list
source pipeline_helper_functions.sh

## Run the whole reciprocal-hybrid analysis for the two parents.
##
## Steps, in order:
##   1. validate the read alignment method and create the output directories
##   2. if no SNP file was provided (SNP_filename is 'NA'), generate one from the reads data
##   3. report the SNP frequency distribution across loci
##   4. run the DE pipeline on the parents, the reciprocal hybrids, and each hybrid/parent pair
##   5. create the in silico hybrid and compare each reciprocal hybrid to it
##   6. record the read count frequency distributions
##
## Globals read: ref_parent, other_parent, top_level_reads_directory,
##   read_alignment_method, bowtie_index_basename,
##   genomic_reference_fasta_file_for_bwa, gene_model_annotations_GFF3_file,
##   reads_processing_output_directory, SNP_output_directory,
##   read_counts_per_locus_directory, differential_expression_directory
## Globals written: reference_parent, read_count_frequency_distribution_directory,
##   directory_list, SNP_filename, the two 'NA'-able thresholds,
##   condition_1, condition_2 (the last two are read by run_DE_pipeline_on_two_conditions)
run_reciprocal_hybrid_pipeline () {
	local i

	## The body refers to the reference parent as 'reference_parent', while the
	## command line is parsed into 'ref_parent'. Fall back to 'ref_parent' when
	## 'reference_parent' has not been set, so that file names and condition
	## names are not built from an empty string.
	: "${reference_parent:=${ref_parent}}"

	## Likewise, give the read count frequency distribution directory a default
	## location when nothing has set it; otherwise it would be silently dropped
	## from the directory list and the output files would land in '/'.
	## NOTE(review): the default directory name below is a choice made here - confirm.
	: "${read_count_frequency_distribution_directory:=${reads_processing_output_directory}/read_count_frequency_distribution}"

	check_if_specified_read_alignment_method_is_valid

	## Create required directories
	## ('directory_list' is left global: it is presumably what
	## create_directories_in_directory_list reads - confirm)
	directory_list=( "${differential_expression_directory}" "${SNP_output_directory}" "${read_count_frequency_distribution_directory}" )
	create_directories_in_directory_list

	## If a SNP file was not provided
	if [ "${SNP_filename}" = NA ]; then
		if [ "${cutoff_for_filtering_SNPs_that_are_just_outside_exon_boundaries}" = NA ]; then
			echo "Warning: cutoff_for_filtering_SNPs_that_are_just_outside_exon_boundaries was not specified, so it is being set to the default value of 10"
			cutoff_for_filtering_SNPs_that_are_just_outside_exon_boundaries=10
		fi
		if [ "${negative_SNP_call_coverage_threshold}" = NA ]; then
			echo "Warning: negative_SNP_call_coverage_threshold was not specified, so it is being set to the default value of 20"
			negative_SNP_call_coverage_threshold=20
		fi
		## then generate one using the provided reads data
		SNP_filename=${SNP_output_directory}/predicted_SNPs_between_${reference_parent}_and_${other_parent}__final.snp
		## NOTE(review): 'reference_genotype' is not assigned anywhere in this file.
		## It is deliberately left unquoted so that, when unset, it contributes no
		## argument (the behaviour this call has always had) - confirm the argument
		## list that generate_SNP_file_from_reads_data.sh expects.
		./generate_SNP_file_from_reads_data.sh "$ref_parent" "$other_parent" $reference_genotype "$top_level_reads_directory" "$read_alignment_method" "$bowtie_index_basename" "$genomic_reference_fasta_file_for_bwa" "$gene_model_annotations_GFF3_file" "$SNP_filename" "$reads_processing_output_directory" "$cutoff_for_filtering_SNPs_that_are_just_outside_exon_boundaries" "$negative_SNP_call_coverage_threshold"
	fi

	## Report the SNP frequency distribution across loci
	SNP_frequency_distribution_output_file=${SNP_output_directory}/frequency_distribution_across_loci_for_SNPs_between_${reference_parent}_and_${other_parent}
	./get_SNPs_per_locus.py "$SNP_filename" "$gene_model_annotations_GFF3_file" "$SNP_frequency_distribution_output_file"

	## test for high-parent, low-parent, overdominant, or underdominant expression in hybrid, and
	## test for differential expression between parents and between reciprocal hybrids

	hybrid_RO=real_hybrid_${reference_parent}_${other_parent}
	hybrid_OR=real_hybrid_${other_parent}_${reference_parent}

	## The two lists are parallel: entry i of each list forms one comparison.
	condition_1_list=( "$reference_parent" "$hybrid_RO" "$hybrid_RO" "$hybrid_RO" "$hybrid_OR" "$hybrid_OR" )
	condition_2_list=( "$other_parent" "$hybrid_OR" "$reference_parent" "$other_parent" "$reference_parent" "$other_parent" )
	for i in "${!condition_1_list[@]}"; do
		condition_1=${condition_1_list[i]}
		condition_2=${condition_2_list[i]}
		run_DE_pipeline_on_two_conditions
	done

	## create in silico hybrid

	echo Creating in silico hybrid

	./create_in_silico_hybrid.sh "$reference_parent" "$other_parent" "$top_level_reads_directory" "$reads_processing_output_directory"

	## test for non-additive expression in hybrid
	## (each reciprocal hybrid is compared to the in silico hybrid)

	in_silico_hybrid=in_silico_hybrid_${reference_parent}_${other_parent}
	condition_2=${in_silico_hybrid}

	condition_1=${hybrid_RO}
	run_DE_pipeline_on_two_conditions

	condition_1=${hybrid_OR}
	run_DE_pipeline_on_two_conditions

	## Record the read count frequency distribution across genes for each condition
	RCFD_output_file=${read_count_frequency_distribution_directory}/read_count_frequency_distribution_across_loci_for_parents_${reference_parent}_and_${other_parent}_and_their_reciprocal_and_in_silico_hybrids
	allele_specific_RCFD_output_file=${read_count_frequency_distribution_directory}/allele_specific_read_count_frequency_distribution_across_loci_for_parents_${reference_parent}_and_${other_parent}_and_their_reciprocal_and_in_silico_hybrids
	./read_count_frequency_distribution.sh "${read_counts_per_locus_directory}" "${RCFD_output_file}" "${allele_specific_RCFD_output_file}" "$reference_parent" "$other_parent" "${hybrid_RO}" "${hybrid_OR}" "${in_silico_hybrid}"
}

## Compare two conditions by running mRNA-seq_pipeline.sh on them.
##
## Takes no arguments; the caller sets the globals 'condition_1' and
## 'condition_2' before calling. The remaining settings are read from the
## globals parsed from the command line.
## Returns the exit status of mRNA-seq_pipeline.sh.
##
## Every expansion is quoted so that an empty value or a value containing
## whitespace still occupies exactly one positional argument.
run_DE_pipeline_on_two_conditions () {
	echo "Comparing condition $condition_1 to condition $condition_2"
	./mRNA-seq_pipeline.sh "$condition_1" "$condition_2" "${top_level_reads_directory}" "$read_alignment_method" "$bowtie_index_basename" "$genomic_reference_fasta_file_for_bwa" "$gene_model_annotations_GFF3_file" "$SNP_filename" "$reads_processing_output_directory"
}

## Entry point: run the pipeline now that both functions above are defined.
run_reciprocal_hybrid_pipeline
