#!/usr/bin/env bash
## bash is requested explicitly because this script uses arrays, which are not
## part of POSIX sh and fail under strictly POSIX shells such as dash.

## Prerequisite: the biological replicate names directory inside the results
## directory, and the names files for both parents, must already exist; this
## script reads them and does not create them.

## Exactly four positional arguments are required. On misuse, print the usage
## line to stderr and exit with a non-zero status so callers can detect it.
if [ "$#" -ne 4 ]; then
	echo "Usage: ./create_in_silico_hybrid.sh parent_A parent_B top_level_reads_directory results_directory" >&2
	exit 1
fi

## Positional arguments, in the order: parent A, parent B,
## top-level reads directory, results directory.
## (top_level_reads_directory is not referenced in the visible part of this script.)
parent_A="$1"
parent_B="$2"
top_level_reads_directory="$3"
results_directory="$4"

## Working locations: a scratch directory named relative to the current
## working directory, and two subdirectories of the results directory.
temp_dir="temp"
read_counts_per_locus_dir="${results_directory}/read_counts_per_locus"
bio_rep_names_dir="${results_directory}/biological_replicate_names"

## Load the biological replicate names of each parent into an array.
parent_A_bio_rep_names_file="${bio_rep_names_dir}/biological_replicate_names_for_${parent_A}"
parent_B_bio_rep_names_file="${bio_rep_names_dir}/biological_replicate_names_for_${parent_B}"

## Stop early with a clear message if a names file is missing or unreadable,
## rather than silently continuing with zero replicates.
for bio_rep_names_file in "${parent_A_bio_rep_names_file}" "${parent_B_bio_rep_names_file}"; do
	if [ ! -f "${bio_rep_names_file}" ] || [ ! -r "${bio_rep_names_file}" ]; then
		echo "Error: cannot read biological replicate names file: ${bio_rep_names_file}" >&2
		exit 1
	fi
done

## read -d '' consumes the whole file and -a splits it on whitespace (spaces,
## tabs and newlines) into array elements, without any glob expansion of the
## names. read returns non-zero on reaching end of file, which is expected
## here, so its status is deliberately not checked.
read -r -d '' -a parent_A_bio_rep_names_list < "${parent_A_bio_rep_names_file}"
read -r -d '' -a parent_B_bio_rep_names_list < "${parent_B_bio_rep_names_file}"

num_bio_reps_for_parent_A=${#parent_A_bio_rep_names_list[@]}
num_bio_reps_for_parent_B=${#parent_B_bio_rep_names_list[@]}
## Use the smaller of the two replicate counts: start from parent A's count
## and lower it only when parent B has strictly fewer replicates.
num_bio_reps_to_use=${num_bio_reps_for_parent_A}
if (( num_bio_reps_for_parent_B < num_bio_reps_for_parent_A )); then
	num_bio_reps_to_use=${num_bio_reps_for_parent_B}
fi

## For each chosen pair of biological replicates (one from parent A, one from parent B),
## combine the data from their statistics per locus files to make a statistics per locus
## file for an invented "biological replicate" of the in silico hybrid.
## Pairs are formed by position: the i-th name of parent A with the i-th name of parent B.
## Any failing step aborts the script with a non-zero status, so that a hybrid
## replicate whose file was not produced is never recorded as available.

## The normalized output paths handed to R live under the temp directory,
## so make sure that directory exists before the first iteration.
mkdir -p -- "${temp_dir}" || exit 1

## Fixed file names in the current working directory: the arguments file is
## rewritten for every pair just before the R script is run.
R_arguments_file=upper.quartile.normalize.two.statistics.per.locus.files.arguments.file
R_script=upper.quartile.normalize.two.statistics.per.locus.files.R

i=0
in_silico_hybrid_bio_rep_names_string=
while [ "$i" -lt "${num_bio_reps_to_use}" ]; do
	parent_A_bio_rep=${parent_A_bio_rep_names_list[i]}
	parent_B_bio_rep=${parent_B_bio_rep_names_list[i]}

	parent_A_statistics_per_locus_file="${read_counts_per_locus_dir}/${parent_A_bio_rep}_statistics_per_locus"
	parent_B_statistics_per_locus_file="${read_counts_per_locus_dir}/${parent_B_bio_rep}_statistics_per_locus"

	## Normalize the two statistics per locus files so that they have the same 75th percentile read count
	parent_A_statistics_per_locus_file_after_normalization="${temp_dir}/${parent_A_bio_rep}_statistics_per_locus_after_normalization"
	parent_B_statistics_per_locus_file_after_normalization="${temp_dir}/${parent_B_bio_rep}_statistics_per_locus_after_normalization"

	## One path per line: the two inputs first, then the two normalized outputs.
	## Truncating with '>' replaces any arguments file left by a previous pair.
	printf '%s\n' \
		"${parent_A_statistics_per_locus_file}" \
		"${parent_B_statistics_per_locus_file}" \
		"${parent_A_statistics_per_locus_file_after_normalization}" \
		"${parent_B_statistics_per_locus_file_after_normalization}" \
		> "${R_arguments_file}" || exit 1

	if ! R CMD BATCH --vanilla "${R_script}"; then
		echo "Error: ${R_script} failed for ${parent_A_bio_rep} and ${parent_B_bio_rep}" >&2
		exit 1
	fi

	## Join the two normalized statistics per locus files to get the in-silico hybrid
	## (hybrid replicates are numbered from 1)
	in_silico_hybrid_bio_rep_number=$(( i + 1 ))
	in_silico_hybrid_bio_rep="in_silico_hybrid_${parent_A}_${parent_B}_${in_silico_hybrid_bio_rep_number}"
	in_silico_hybrid_statistics_per_locus_file="${read_counts_per_locus_dir}/${in_silico_hybrid_bio_rep}_statistics_per_locus"
	if ! ./add_results_from_two_statistics_per_locus_files.py \
		"${parent_A_statistics_per_locus_file_after_normalization}" \
		"${parent_B_statistics_per_locus_file_after_normalization}" \
		"${in_silico_hybrid_statistics_per_locus_file}"; then
		echo "Error: could not create ${in_silico_hybrid_statistics_per_locus_file}" >&2
		exit 1
	fi

	## Append the new name, separated from earlier names by a single space
	in_silico_hybrid_bio_rep_names_string="${in_silico_hybrid_bio_rep_names_string:+${in_silico_hybrid_bio_rep_names_string} }${in_silico_hybrid_bio_rep}"
	i=$(( i + 1 ))
done

## Write the names of the "biological replicates" generated for the in-silico hybrid
## to a names file, on a single line, in the biological replicate names directory.
## The target path is quoted so that directories containing whitespace work, and a
## failed write is reported instead of passing silently.
in_silico_hybrid_bio_rep_names_file="${bio_rep_names_dir}/biological_replicate_names_for_in_silico_hybrid_${parent_A}_${parent_B}"
if ! printf '%s\n' "${in_silico_hybrid_bio_rep_names_string}" > "${in_silico_hybrid_bio_rep_names_file}"; then
	echo "Error: could not write ${in_silico_hybrid_bio_rep_names_file}" >&2
	exit 1
fi

