#!/usr/bin/env python
# read_alignment_statistics_per_locus__module.py

## Designed to handle alignment files in SAM format

import sys, re
from process_gene_models_helper_functions import *
from get_SNPs_for_read__module import *
from chromosome_helper_functions__module import *

# TO CONSIDER FOR LATER ON:
# 1. How do I identify new transcripts (or new gene models, complementary to the known ones) from the TopHat output?
# 2. Should we use individual base call quality information when calling SNP ratios to determine allelic variation?


## NOTE:
##
## All mapped reads are represented on the forward genomic strand. The bases are reverse complemented from the unmapped read sequence
## and the quality scores and cigar strings are recorded consistently with the bases.
##
## POS refers to the 1-based leftmost POSition/coordinate of the clipped sequence
## - to get final read length after left-clipping, look at CIGAR string
##
## Tag:
## NM (type 'i') Number of nucleotide differences (i.e. edit distance to the reference sequence)


######################
## Global Variables ##
######################

## Largest edit distance (SAM 'NM' tag) that the best alignment of a read may have and still be
## counted towards a locus or allele; best alignments above this threshold are only tallied in the
## DEBUG statistics printed by get_read_statistics_per_locus__internal().
MAX_EDIT_DISTANCE = 4
## Value of 'analysis_option' that switches on the "-filtered_alignments*" side files and makes
## print_read_statistics_per_locus() also report loci that received no reads.
ANALYSIS_OPTION_FULL = 'full'

## The following parameter is used to check and correct for a special case where Tophat chooses an incorrect genomic alignment
## over a correct exon-spanning alignment.
## (We do not check whether the genomic mapping is of worse quality than the candidate exon-spanning mapping.  We just
## check for an alignment with a CIGAR string of '<nonnegative_integer>M' where up to
## 'MAX_NUM_BASES_ALLOWED_OUTSIDE_OF_EXON_MODEL_FOR_GENOMIC_ALIGNMENTS' bases may map outside of an exon.)
MAX_NUM_BASES_ALLOWED_OUTSIDE_OF_EXON_MODEL_FOR_GENOMIC_ALIGNMENTS = 10


def get_read_statistics_per_locus_from_multiple_SAM_files(SAM_filename_list, locus_to_models_dict, model_to_filtered_exons_dict, locus_to_exons_of_selected_model_dict,
		locus_to_combined_length_of_non_overlapping_regions_of_selected_exon_model_dict, general_location_to_loci_dict,
		SNP_dict, SNP_to_refbase_dict, SNP_to_base_change_dict, analysis_option, pipeline_log_filename):
	"""Accumulate per-locus read counts over several SAM files.

	Each file is processed independently by
	get_read_statistics_per_locus_for_single_SAM_file() and the per-file
	results are summed.  The total number of mappable reads is appended to
	'pipeline_log_filename'; two DEBUG lines are printed to stdout.

	Returns a tuple (locus_to_RPKM_dict, locus_to_num_reads_dict,
	locus_to_num_reads_per_allele_dict) where
	  - locus_to_RPKM_dict is always empty (RPKM is no longer computed),
	  - locus_to_num_reads_dict maps locus -> number of reads,
	  - locus_to_num_reads_per_allele_dict maps locus -> {'ref': n, 'other': m}.
	"""
	locus_to_num_reads_dict, locus_to_num_reads_per_allele_dict = {}, {}
	total_number_of_mappable_reads = 0
	num_PCR_or_optical_duplicate_reads = 0
	tag_types_dict = {}
	for SAM_filename in SAM_filename_list:
		(total_number_of_mappable_reads_for_file, locus_to_num_reads_dict_for_file, locus_to_num_reads_per_allele_dict_for_file,
			PCR_or_optical_duplicate_read_dict_for_file, tag_types_dict_for_file) = \
			get_read_statistics_per_locus_for_single_SAM_file(SAM_filename, locus_to_models_dict, model_to_filtered_exons_dict, locus_to_exons_of_selected_model_dict,
				locus_to_combined_length_of_non_overlapping_regions_of_selected_exon_model_dict, general_location_to_loci_dict,
				SNP_dict, SNP_to_refbase_dict, SNP_to_base_change_dict, analysis_option)
		total_number_of_mappable_reads += total_number_of_mappable_reads_for_file
		num_PCR_or_optical_duplicate_reads += len(PCR_or_optical_duplicate_read_dict_for_file)

		## the tag dictionary is used as a set: only its keys matter
		for tag_type in tag_types_dict_for_file:
			tag_types_dict[tag_type] = 0

		for locus in locus_to_num_reads_dict_for_file:
			locus_to_num_reads_dict[locus] = locus_to_num_reads_dict.get(locus, 0) + locus_to_num_reads_dict_for_file[locus]

		for locus in locus_to_num_reads_per_allele_dict_for_file:
			allele_counts_for_file = locus_to_num_reads_per_allele_dict_for_file[locus]
			allele_counts = locus_to_num_reads_per_allele_dict.setdefault(locus, {'ref': 0, 'other': 0})
			allele_counts['ref'] += allele_counts_for_file['ref']
			allele_counts['other'] += allele_counts_for_file['other']

	## RPKM per locus is intentionally not computed any more (EDIT 8/15/2011); the empty dictionary
	## is still returned so that the shape of the return value does not change.  The former formula was:
	##   RPKM = num_reads * 10^9 / adjusted_sum_of_exon_lengths / total_number_of_mappable_reads
	## (skipping loci whose adjusted sum of exon lengths is 0).
	locus_to_RPKM_dict = {}

	## 'with' guarantees the log file is closed even if the write fails
	with open(pipeline_log_filename, 'a') as pipeline_log_file:
		pipeline_log_file.write('Total number of mappable reads: %d\n' % total_number_of_mappable_reads)

	print(('DEBUG: Total number of PCR or optical duplicate reads across all specified SAM files: %d' % num_PCR_or_optical_duplicate_reads) + ' ' +
		'(a given read is flagged if listed as PCR/optical duplicate in at least one alignment)')
	print('DEBUG: List of tag types is: (%s)' % ', '.join(tag_types_dict.keys()))
	return locus_to_RPKM_dict, locus_to_num_reads_dict, locus_to_num_reads_per_allele_dict

def get_read_statistics_per_locus_for_single_SAM_file(SAM_filename, locus_to_models_dict, model_to_filtered_exons_dict, locus_to_exons_of_selected_model_dict,
		locus_to_combined_length_of_non_overlapping_regions_of_selected_exon_model_dict, general_location_to_loci_dict,
		SNP_dict, SNP_to_refbase_dict, SNP_to_base_change_dict, analysis_option):
	"""Compute read statistics per locus for one SAM file.

	The file is read twice: a first pass determines which reads have a unique
	best alignment (and the edit distance of that alignment), a second pass
	assigns those best alignments to loci and alleles.

	Returns a tuple (total_number_of_mappable_reads, locus_to_num_reads_dict,
	locus_to_num_reads_per_allele_dict, PCR_or_optical_duplicate_read_dict,
	tag_types_dict).
	"""
	## first pass: identify uniquely-mapping reads
	uniquely_mapping_read_to_best_edit_distance_dict, PCR_or_optical_duplicate_read_dict, tag_types_dict = \
		get_uniquely_mapping_read_to_best_edit_distance_dict(SAM_filename)

	print('DEBUG: number of uniq mapped rds (with no restrictions on edit distance of best alignment): %d' %
		len(uniquely_mapping_read_to_best_edit_distance_dict))

	## second pass: record uniquely-mapping reads
	total_number_of_mappable_reads, locus_to_num_reads_dict, locus_to_num_reads_per_allele_dict = \
		get_read_statistics_per_locus__internal(SAM_filename, uniquely_mapping_read_to_best_edit_distance_dict,
			locus_to_models_dict, model_to_filtered_exons_dict, locus_to_exons_of_selected_model_dict,
			locus_to_combined_length_of_non_overlapping_regions_of_selected_exon_model_dict,
			general_location_to_loci_dict, SNP_dict, SNP_to_refbase_dict, SNP_to_base_change_dict, analysis_option)
	print('DEBUG: Recorded uniquely-mapping reads')

	return total_number_of_mappable_reads, locus_to_num_reads_dict, locus_to_num_reads_per_allele_dict, PCR_or_optical_duplicate_read_dict, tag_types_dict

def get_uniquely_mapping_read_to_best_edit_distance_dict(SAM_filename):
	"""Find the reads in a SAM file that have a single best alignment.

	A read maps uniquely when exactly one of its retained alignments attains
	the smallest edit distance seen for that read.  Alignments whose CIGAR
	string contains anything besides M, N and digits, and alignments flagged
	for discarding by process_alignment_line(), are not considered.

	Returns a tuple (read_to_best_edit_distance_dict,
	PCR_or_optical_duplicate_read_dict, tag_types_dict) where the first maps
	each uniquely mapping read name to its best edit distance and the other
	two are used as sets (read names flagged as duplicates; tag names seen).
	"""
	read_to_best_edit_distance_dict = {}
	uniquely_mapping_read_dict = {}
	PCR_or_optical_duplicate_read_dict = {}
	tag_types_dict = {}
	line_count = 0
	## 'with' closes the SAM file even when process_alignment_line() raises
	with open(SAM_filename, 'r') as SAM_file:
		for line in SAM_file:
			line_count += 1
			if (line_count % 1000000) == 0:
				## progress indicator
				print('%d,' % line_count)
			line = line.rstrip('\n\r')
			if (line == '') or (line[0] == '@'):
				## empty line or header line
				continue
			(query_name, chromosome, position, read_length, read_sequence, on_reverse_strand, CIGAR_string,
				edit_distance, discard_alignment, is_PCR_or_optical_duplicate, tag_types_dict_for_single_line, query_unmapped) = process_alignment_line(line)

			if does_CIGAR_string_have_non_digit_characters_besides_M_and_N(CIGAR_string):
				print('WARNING: Discarding alignment that has a CIGAR string that contains characters besides M, N, and the digits 0-9')
				continue

			## Debugging information (collected before the discard check on purpose)
			if is_PCR_or_optical_duplicate:
				PCR_or_optical_duplicate_read_dict[query_name] = 0
			for tag_type in tag_types_dict_for_single_line:
				tag_types_dict[tag_type] = 0

			if discard_alignment:
				continue

			## Keep alignment
			if (query_name not in read_to_best_edit_distance_dict) or (edit_distance < read_to_best_edit_distance_dict[query_name]):
				# new best edit distance: the read is (again) a candidate for unique mapping
				read_to_best_edit_distance_dict[query_name] = edit_distance
				uniquely_mapping_read_dict[query_name] = 0
			elif edit_distance == read_to_best_edit_distance_dict[query_name]:
				# tie for best edit distance: the read no longer maps uniquely
				uniquely_mapping_read_dict.pop(query_name, None)

	print('DEBUG: finished processing SAM file in function get_uniquely_mapping_read_to_best_edit_distance_dict()')

	## restrict read_to_best_edit_distance_dict to uniquely mapping reads; iterate over a snapshot
	## of the keys because entries are deleted inside the loop
	for query in list(read_to_best_edit_distance_dict):
		if query not in uniquely_mapping_read_dict:
			del read_to_best_edit_distance_dict[query]
	print('DEBUG: modified read_to_best_edit_distance_dict to only contain uniquely mapping reads in function get_uniquely_mapping_read_to_best_edit_distance_dict()')
	return read_to_best_edit_distance_dict, PCR_or_optical_duplicate_read_dict, tag_types_dict

def get_read_statistics_per_locus__internal(SAM_filename, uniquely_mapping_read_to_best_edit_distance_dict, locus_to_models_dict, model_to_filtered_exons_dict,
		locus_to_exons_of_selected_model_dict, locus_to_combined_length_of_non_overlapping_regions_of_selected_exon_model_dict,
		general_location_to_loci_dict, SNP_dict, SNP_to_refbase_dict, SNP_to_base_change_dict, analysis_option):
	"""Assign the unique best alignment of each read to a locus and, if possible, to an allele.

	Only alignments that
	  - have a CIGAR string made of M, N and digits,
	  - are not flagged for discarding by process_alignment_line(),
	  - belong to a read listed in 'uniquely_mapping_read_to_best_edit_distance_dict', and
	  - have exactly the edit distance recorded there
	are considered.  Such an alignment is counted for a locus when it matches
	at least one gene model of exactly one locus found in the position bin of
	the alignment, and its edit distance does not exceed MAX_EDIT_DISTANCE.
	It is additionally counted for the 'ref' or 'other' allele when the SNPs
	it covers support only that allele and no unexpected base is seen.

	When 'analysis_option' equals ANALYSIS_OPTION_FULL, the retained alignment
	lines are also written to three files named after 'SAM_filename' with the
	suffixes '-filtered_alignments', '-filtered_alignments_that_map_to_a_specific_locus'
	and '-filtered_alignments_that_are_assigned_to_a_specific_allele'.

	'locus_to_exons_of_selected_model_dict' and
	'locus_to_combined_length_of_non_overlapping_regions_of_selected_exon_model_dict'
	are accepted for interface compatibility but are not used here.

	Returns a tuple (total_number_of_mappable_reads, locus_to_num_reads_dict,
	locus_to_num_reads_per_allele_dict).
	"""
	locus_to_num_reads_dict, locus_to_num_reads_per_allele_dict = {}, {}
	mappable_reads_dict = {}
	num_uniquely_mapping_reads_discarded_due_to_large_edit_distance_of_best_alignment = 0
	num_uniquely_mapping_reads_that_map_to_a_specific_locus_discarded_due_to_large_edit_distance_of_best_alignment = 0
	num_uniquely_mapping_reads_assigned_to_a_specific_allele_discarded_due_to_large_edit_distance_of_best_alignment = 0

	write_filtered_alignments = (analysis_option == ANALYSIS_OPTION_FULL)
	filtered_alignments_file = None
	filtered_alignments_for_specific_locus_file = None
	filtered_alignments_for_specific_allele_file = None

	SAM_file = open(SAM_filename, 'r')
	try:
		if write_filtered_alignments:
			filtered_alignments_file = open(SAM_filename + "-filtered_alignments", 'w')
			filtered_alignments_for_specific_locus_file = open(SAM_filename + "-filtered_alignments_that_map_to_a_specific_locus", 'w')
			filtered_alignments_for_specific_allele_file = open(SAM_filename + "-filtered_alignments_that_are_assigned_to_a_specific_allele", 'w')

		for line in SAM_file:
			line = line.rstrip('\n\r')
			if (line == '') or (line[0] == '@'):
				## empty line or header line
				continue
			(query_name, chromosome, position, read_length, read_sequence, on_reverse_strand, CIGAR_string,
				edit_distance, discard_alignment, is_PCR_or_optical_duplicate, tag_types_dict_for_single_line, query_unmapped) = process_alignment_line(line)
			if does_CIGAR_string_have_non_digit_characters_besides_M_and_N(CIGAR_string):
				print('WARNING: Discarding alignment that has a CIGAR string that contains characters besides M, N, and the digits 0-9')
				continue

			## record mappable reads
			## NOTE(review): this happens after the CIGAR filter above, so a read whose alignments all
			## have CIGAR characters other than M/N is not counted as mappable -- confirm this is intended
			if not query_unmapped:
				mappable_reads_dict[query_name] = 0

			## check that the alignment is good
			if discard_alignment:
				continue
			## check that the read has a unique best alignment
			if query_name not in uniquely_mapping_read_to_best_edit_distance_dict:
				continue
			## check that we are looking at that best alignment
			if edit_distance != uniquely_mapping_read_to_best_edit_distance_dict[query_name]:
				continue

			# record statistics on how often the edit distance of the best alignment of a read is larger than the specified maximum
			bool_exceeds_max_edit_distance = (edit_distance > MAX_EDIT_DISTANCE)
			if bool_exceeds_max_edit_distance:
				num_uniquely_mapping_reads_discarded_due_to_large_edit_distance_of_best_alignment += 1
			elif write_filtered_alignments:
				filtered_alignments_file.write('%s\n' % line)

			## associate read with a given locus
			## if it aligns to at least one gene model for that locus
			## and does not align to any gene model of any other locus

			bin_index_for_position = get_bin_index(position)
			if (chromosome in general_location_to_loci_dict) and (bin_index_for_position in general_location_to_loci_dict[chromosome]):
				locus_list = general_location_to_loci_dict[chromosome][bin_index_for_position]
			else:
				locus_list = []

			matching_locus_list = []
			for locus in locus_list:
				for model in locus_to_models_dict[locus]:
					if does_read_match_exon_model(model_to_filtered_exons_dict[model], position, CIGAR_string):
						## one matching gene model is enough for the locus
						matching_locus_list.append(locus)
						break
			if len(matching_locus_list) != 1:
				## either the read is not associated with any locus,
				## or we cannot determine which locus the read is mapping to
				continue
			matching_locus = matching_locus_list[0]

			## (EDIT 8/15/2011 - the read is not required to map to the selected gene model of the matching locus)

			# check that the edit distance is not larger than the specified maximum
			if bool_exceeds_max_edit_distance:
				num_uniquely_mapping_reads_that_map_to_a_specific_locus_discarded_due_to_large_edit_distance_of_best_alignment += 1
			else:
				if write_filtered_alignments:
					filtered_alignments_for_specific_locus_file.write('%s\n' % line)

				## record alignment of read to matching locus
				if matching_locus not in locus_to_num_reads_dict:
					locus_to_num_reads_dict[matching_locus] = 0
				locus_to_num_reads_dict[matching_locus] += 1

			## determine whether read can be identified as mapping to a specific allele

			# If read is from reverse strand, then TopHat automatically transforms the read to
			# get the corresponding read from the forward strand, so no further transformation is necessary for
			# the read sequence
			num_ref_allele_SNPs_for_read, num_other_allele_SNPs_for_read, num_unexpected_SNPs_for_read = \
				get_SNP_breakdown_for_read(chromosome, position, read_length, read_sequence, CIGAR_string, SNP_dict, SNP_to_refbase_dict, SNP_to_base_change_dict)

			## a read supports an allele only if every SNP it covers agrees with that allele
			maps_to_ref_allele = (num_ref_allele_SNPs_for_read >= 1) and (num_other_allele_SNPs_for_read == 0) and (num_unexpected_SNPs_for_read == 0)
			maps_to_other_allele = (num_other_allele_SNPs_for_read >= 1) and (num_ref_allele_SNPs_for_read == 0) and (num_unexpected_SNPs_for_read == 0)

			if not (maps_to_ref_allele or maps_to_other_allele):
				continue
			if bool_exceeds_max_edit_distance:
				num_uniquely_mapping_reads_assigned_to_a_specific_allele_discarded_due_to_large_edit_distance_of_best_alignment += 1
				continue

			if write_filtered_alignments:
				filtered_alignments_for_specific_allele_file.write('%s\n' % line)
			if matching_locus not in locus_to_num_reads_per_allele_dict:
				locus_to_num_reads_per_allele_dict[matching_locus] = {}
				locus_to_num_reads_per_allele_dict[matching_locus]['ref'] = 0
				locus_to_num_reads_per_allele_dict[matching_locus]['other'] = 0
			if maps_to_ref_allele:
				locus_to_num_reads_per_allele_dict[matching_locus]['ref'] += 1
			else:
				locus_to_num_reads_per_allele_dict[matching_locus]['other'] += 1
	finally:
		## release every file handle, also when an alignment line could not be processed
		SAM_file.close()
		for opened_file in (filtered_alignments_file, filtered_alignments_for_specific_locus_file, filtered_alignments_for_specific_allele_file):
			if opened_file is not None:
				opened_file.close()

	print('DEBUG: Number of uniquely mapping reads where the edit distance of the best alignment was larger than %d: %d' %
		(MAX_EDIT_DISTANCE, num_uniquely_mapping_reads_discarded_due_to_large_edit_distance_of_best_alignment))
	print('DEBUG: Number of uniquely mapping reads mapping to a specific locus where the edit distance of the best alignment was larger than %d: %d' %
		(MAX_EDIT_DISTANCE, num_uniquely_mapping_reads_that_map_to_a_specific_locus_discarded_due_to_large_edit_distance_of_best_alignment))
	print('DEBUG: Number of uniquely mapping reads assigned to a specific allele where the edit distance of the best alignment was larger than %d: %d' %
		(MAX_EDIT_DISTANCE, num_uniquely_mapping_reads_assigned_to_a_specific_allele_discarded_due_to_large_edit_distance_of_best_alignment))

	## get total number of mappable reads - according to 'query_unmapped' flag in SAM file
	total_number_of_mappable_reads = len(mappable_reads_dict)

	return total_number_of_mappable_reads, locus_to_num_reads_dict, locus_to_num_reads_per_allele_dict

def does_read_match_exon_model(exon_list, read_start_position, CIGAR_string):
	"""Tell whether an alignment is compatible with an exon model.

	'exon_list' is a list of [exon_start, exon_stop] pairs, 'read_start_position'
	the leftmost position of the alignment and 'CIGAR_string' its CIGAR string.

	The alignment matches when, starting from an exon that contains the read
	start, compare_exon_model_against_CIGAR_string() accepts it.  Otherwise a
	fallback handles the special case where Tophat chooses an incorrect
	genomic alignment over a correct exon-spanning alignment: an ungapped
	alignment ('<nonnegative_integer>M') is accepted when at most
	MAX_NUM_BASES_ALLOWED_OUTSIDE_OF_EXON_MODEL_FOR_GENOMIC_ALIGNMENTS of its
	bases fall outside of the exons.  (We do not check whether the genomic
	mapping is of worse quality than the candidate exon-spanning mapping.)

	Returns True or False.  An empty CIGAR string never matches.
	"""
	## exact comparison, tried from every exon that contains the start of the read
	for exon_index, (exon_start, exon_stop) in enumerate(exon_list):
		if (exon_start <= read_start_position) and (read_start_position <= exon_stop):
			if compare_exon_model_against_CIGAR_string(CIGAR_string, exon_list, exon_index, read_start_position):
				return True

	## fallback: read aligned directly to the genomic sequence (CIGAR string of '<nonnegative_integer>M').
	## endswith() is also False for an empty CIGAR string, which used to raise an IndexError here.
	if not CIGAR_string.endswith('M'):
		return False
	match_length_string = CIGAR_string[:-1]
	if not match_length_string.isdigit():
		return False

	num_bases_in_alignment = int(match_length_string)
	read_stop_position = read_start_position + num_bases_in_alignment - 1
	read_range = [read_start_position, read_stop_position]

	## total overlap between the read and the exons of the model
	num_bases_overlapping_exon_model = sum(get_overlap_between_two_ranges(exon, read_range) for exon in exon_list)
	num_bases_not_overlapping_exon_model = num_bases_in_alignment - num_bases_overlapping_exon_model

	## the read aligns well enough when the bases outside of the exon model stay within our threshold
	return num_bases_not_overlapping_exon_model <= MAX_NUM_BASES_ALLOWED_OUTSIDE_OF_EXON_MODEL_FOR_GENOMIC_ALIGNMENTS

def process_alignment_line(line):
	"""Parse one alignment line of a SAM file.

	Returns the tuple (query_name, chromosome, position, read_length,
	query_sequence, on_reverse_strand, CIGAR_string, edit_distance,
	discard_alignment, is_PCR_or_optical_duplicate, tag_types_dict,
	query_unmapped) where
	  - read_length is the sum of the M lengths of the CIGAR string, or 'NA'
	    when the CIGAR string contains D, I, *, H, S or P,
	  - edit_distance is the integer value of the NM tag, or 'NA' if absent,
	  - on_reverse_strand, is_PCR_or_optical_duplicate and query_unmapped are
	    the corresponding FLAG bits as 0/1,
	  - tag_types_dict has one key per optional tag found on the line,
	  - discard_alignment is True when the NM tag is missing, when the read is
	    unmapped, not primary, fails quality checks or is a PCR/optical
	    duplicate, or when the CIGAR string cannot be handled.

	Raises IOError for paired-end reads, for an NM tag that is not of type
	'i', for unsupported CIGAR characters, and when the matched length
	differs from the length of the query sequence.
	"""
	discard_alignment = False

	line = line.rstrip('\n\r')
	field_list = line.split('\t')
	query_name = field_list[0]
	flag = int(field_list[1])
	chromosome = get_base_chromosome_name(field_list[2])
	position = int(field_list[3])
	CIGAR_string = field_list[5].strip()
	query_sequence = field_list[9].strip()
	tag_list = field_list[11:]

	## process tag list
	tag_types_dict = {}
	edit_distance_to_reference_sequence = 'NA'
	for tag_string in tag_list:
		## optional fields are TAG:TYPE:VALUE and VALUE may itself contain ':' (e.g. type Z strings),
		## so split on the first two colons only
		tag, vtype, value = tag_string.split(':', 2)
		if tag == 'NM':
			if vtype != 'i':
				raise IOError('Expected tag NM to have vtype=i')
			edit_distance_to_reference_sequence = int(value)
		## tags other than NM are only recorded by name
		tag_types_dict[tag] = 0
	if edit_distance_to_reference_sequence == 'NA':
		print('Warning: Expected to find entry for tag NM on each alignment line of SAM file')
		discard_alignment = True

	## no limit on the edit distance is applied here; callers compare against MAX_EDIT_DISTANCE themselves

	## process flag
	bool_paired_end_read = (flag >> 0) & 1		# 0x0001
	query_unmapped = (flag >> 2) & 1		# 0x0004
	on_reverse_strand = (flag >> 4) & 1		# 0x0010
	not_primary_alignment = (flag >> 8) & 1		# 0x0100
	fails_quality_checks = (flag >> 9) & 1		# 0x0200
	is_PCR_or_optical_duplicate = (flag >> 10) & 1	# 0x0400

	if bool_paired_end_read:
		raise IOError('Error: script cannot currently handle paired end reads')
	if query_unmapped or not_primary_alignment or fails_quality_checks:
		## discard alignment
		discard_alignment = True
	if is_PCR_or_optical_duplicate:
		## alignments flagged as PCR or optical duplicates are discarded as well
		discard_alignment = True

	## process CIGAR string to get read length

	if ('D' in CIGAR_string) or ('I' in CIGAR_string) or ('*' in CIGAR_string) or ('H' in CIGAR_string) or ('S' in CIGAR_string) or ('P' in CIGAR_string):
		## 'H' and 'S' are associated with hard or soft-clipped reads (to handle them, one may need to adjust read length or read start position;
		## no such adjustment of the start position is implemented)
		print('Warning: Script currently discards alignments with \'D\', \'I\', \'*\', \'H\', \'S\', or \'P\' in the CIGAR string, when identifying reads that align to a given gene model')
		discard_alignment = True
		read_length = 'NA'
	else:
		last_index_of_prev_element = -1
		match_length = 0
		for i, character in enumerate(CIGAR_string):
			if character == 'M':
				## the digits since the previous operation give the length of this match
				match_length += int(CIGAR_string[(last_index_of_prev_element + 1):i])
				last_index_of_prev_element = i
			elif character == 'N':
				## skipped region: does not contribute to the read length
				last_index_of_prev_element = i
			elif character in '0123456789':
				## part of a length; consumed when the operation character is reached
				pass
			else:
				raise IOError('CIGAR_string character \'%s\' cannot currently be handled by the code' % character)
		read_length = match_length
		if match_length != len(query_sequence):
			raise IOError('Unexpected: match length (%d) not equal to query sequence length (%d)' % (match_length, len(query_sequence)))

	return query_name, chromosome, position, read_length, query_sequence, on_reverse_strand, CIGAR_string, edit_distance_to_reference_sequence, discard_alignment, is_PCR_or_optical_duplicate, tag_types_dict, query_unmapped

def compare_exon_model_against_CIGAR_string(CIGAR_string, exon_list, exon_index, start_position):
	"""Check a spliced alignment against the exon/intron structure of a gene model.

	'CIGAR_string' must consist of matches (M) separated by skipped regions
	(N).  'exon_list' is a list of [exon_start, exon_stop] pairs,
	'exon_index' the index of the exon in which the alignment starts and
	'start_position' the leftmost position of the alignment.

	The alignment agrees with the model when every match except the last one
	ends exactly at the end of its exon, every skipped region has exactly the
	length of the intron that follows, and the last match ends inside (or at
	the end of) its exon.

	Returns True or False.  Raises IOError when 'exon_index' is past the end
	of 'exon_list' or when the CIGAR string does not have the expected
	structure.
	"""
	if exon_index >= len(exon_list):
		raise IOError('Internal error: expected exon_index to correspond to a real exon, when passed to function compare_exon_model_against_CIGAR_string()')

	CIGAR_string_original = CIGAR_string
	first_loop = True
	while True:

		## compare exon size to length of partial alignment of read to reference

		end_index_of_match = CIGAR_string.find('M')
		if end_index_of_match == -1:
			raise IOError('Expected a \'match\'(M) occurrence in fragment \'%s\' of original CIGAR_string: %s' % (CIGAR_string, CIGAR_string_original))
		match_length_string = CIGAR_string[:end_index_of_match]
		if not match_length_string.isdigit():
			raise IOError('Expected all characters in match_length_string would be digits, for CIGAR_string: %s' % CIGAR_string_original)
		match_length = int(match_length_string)
		if match_length == 0:
			raise IOError('Unexpected: found match(M) occurrence of length 0')

		# get next section of CIGAR string
		CIGAR_string = CIGAR_string[(end_index_of_match + 1):]

		exon_start, exon_stop = exon_list[exon_index]
		if first_loop:
			## the first match starts somewhere inside the exon
			remaining_length_in_exon = (exon_stop - start_position + 1)
		else:
			## later matches start at the beginning of their exon
			remaining_length_in_exon = (exon_stop - exon_start + 1)

		if match_length < remaining_length_in_exon:
			if CIGAR_string == '':
				## the alignment ends inside the exon
				return True
			if CIGAR_string.find('M') == -1:
				raise IOError('Expected a \'match\'(M) occurrence in fragment \'%s\' of original CIGAR_string: %s' % (CIGAR_string, CIGAR_string_original))
			## looks like an alternative splicing
			return False
		if match_length > remaining_length_in_exon:
			## maps into intronic region
			return False
		if CIGAR_string == '':
			## the alignment ends exactly at the end of the exon
			return True

		## compare intron size vs alignment gap in read

		# assert length >= 1 jump in alignment
		end_index_of_jump = CIGAR_string.find('N')
		if end_index_of_jump == -1:
			raise IOError('Expected a \'non-match\'(N) occurrence in fragment \'%s\' of original CIGAR_string: %s' % (CIGAR_string, CIGAR_string_original))
		skipped_region_length_string = CIGAR_string[:end_index_of_jump]
		if not skipped_region_length_string.isdigit():
			raise IOError('Expected all characters in skipped_region_length_string would be digits, for CIGAR_string: %s' % CIGAR_string_original)
		skipped_region_length = int(skipped_region_length_string)
		if skipped_region_length == 0:
			raise IOError('Expected all \'non-match\'(N) occurrences would be of length at least 1, in CIGAR_string: %s' % CIGAR_string_original)

		# assert length >= 1 match after jump in alignment
		new_match_index = CIGAR_string.find('M')
		if ((new_match_index == -1) or
			(not CIGAR_string[(end_index_of_jump + 1):new_match_index].isdigit()) or
			(int(CIGAR_string[(end_index_of_jump + 1):new_match_index]) == 0)):
			print('WARNING: Expected to find positive length match after jump in alignment, in fragment \'%s\' of original CIGAR_string: %s.  Discarding alignment.' % (CIGAR_string, CIGAR_string_original))
			return False

		# move to next exon
		exon_index += 1
		if exon_index == len(exon_list):
			# alignment continues past last exon of gene model
			return False

		intron_length = exon_list[exon_index][0] - exon_list[exon_index - 1][1] - 1
		if skipped_region_length != intron_length:
			## alignment does not match specific gene model
			return False
		CIGAR_string = CIGAR_string[(end_index_of_jump + 1):]

		## prepare for next iteration
		first_loop = False

def get_overlap_between_two_ranges(range_A, range_B):
	"""Return the number of positions shared by two inclusive [start, stop] ranges (0 if disjoint)."""
	## the shared part, if any, runs from the later start to the earlier stop
	shared_start = max(range_A[0], range_B[0])
	shared_stop = min(range_A[1], range_B[1])
	return (shared_stop - shared_start + 1) if shared_start <= shared_stop else 0

def print_read_statistics_per_locus(locus_to_RPKM_dict, locus_to_num_reads_dict, locus_to_num_reads_per_allele_dict, locus_to_exons_of_selected_model_dict, analysis_option, output_filename):
	"""Write a tab-separated table of read statistics, one line per locus.

	The loci reported are the keys of 'locus_to_exons_of_selected_model_dict',
	written in sorted order.  A locus without reads is only reported when
	'analysis_option' equals ANALYSIS_OPTION_FULL.  Columns are: locus name,
	number of reads, RPKM ('NA' when not available), number of reads assigned
	to the reference allele, number of reads assigned to the other allele.
	Missing counts are reported as 0.

	'output_filename' is overwritten.  Returns None.
	"""
	## 'with' closes the output file even when formatting a line fails
	with open(output_filename, 'w') as output_file:
		output_file.write('locus_name\tnum_reads\tRPKM\tNumRef\tNumOth\n')
		## sorted so that the order of the lines does not depend on dictionary ordering
		for locus in sorted(locus_to_exons_of_selected_model_dict):
			num_reads = locus_to_num_reads_dict.get(locus, 0)

			if locus in locus_to_RPKM_dict:
				RPKM_string = '%f' % locus_to_RPKM_dict[locus]
			else:
				RPKM_string = 'NA'

			if locus in locus_to_num_reads_per_allele_dict:
				num_reads_for_ref_allele = locus_to_num_reads_per_allele_dict[locus]['ref']
				num_reads_for_other_allele = locus_to_num_reads_per_allele_dict[locus]['other']
			else:
				num_reads_for_ref_allele = 0
				num_reads_for_other_allele = 0

			if (num_reads > 0) or (analysis_option == ANALYSIS_OPTION_FULL):
				output_file.write('%s\t%d\t%s\t%d\t%d\n' % (locus, num_reads, RPKM_string, num_reads_for_ref_allele, num_reads_for_other_allele))
	return

def does_CIGAR_string_have_non_digit_characters_besides_M_and_N(CIGAR_string):
	"""Return True if the CIGAR string contains any character other than 'M', 'N' or a digit 0-9."""
	permitted_characters = set('MN0123456789')
	for character in CIGAR_string:
		if character not in permitted_characters:
			return True
	## every character is permitted (this includes the empty string)
	return False

