#!/usr/bin/env python
import sys
## abort early unless exactly two command-line arguments were supplied
## (sys.argv[0] is the script name, so three entries are expected in total)
if len(sys.argv) != 3:
	## call-style raise is valid on both Python 2 and Python 3
	raise IOError('Usage: ./split_sam_hits_file.py sam_hits_filename output_directory')

def _sequence_name_from_read_id(read_id_string):
	"""Return the sequence name embedded in an underscore-delimited read ID.

	The read ID is split on '_'; the first and last pieces are dropped and
	the remaining pieces are re-joined with '_' to form the sequence name.

	Raises:
		IOError: if the read ID contains fewer than two underscores.
	"""
	id_list = read_id_string.split('_')
	if len(id_list) < 3:
		raise IOError('Unexpected - first tab-delimited field has less than two underscores')
	return '_'.join(id_list[1:-1])


def main(sam_hits_filename, output_directory_string):
	"""Split a SAM hits file into one SAM file per sequence name.

	Every line starting with '@' is treated as a header line. Every other
	non-blank line is treated as a tab-delimited record whose first field
	is a read ID; the sequence name is taken from that read ID (see
	_sequence_name_from_read_id). Each record is written to
	'<output_directory_string>/accepted_hits_for_<sequence_name>.sam'.

	The first time a sequence name is seen its output file is created
	(truncating any existing file) and all header lines are written to it.
	If the same sequence name shows up again after records for a different
	sequence, the existing file is reopened in append mode so earlier
	records are kept and the header is not repeated.

	Args:
		sam_hits_filename: path of the SAM file to read.
		output_directory_string: directory that receives the output files;
			it is not created here.

	Raises:
		IOError: if a record's read ID has fewer than two underscores, or
			if a file cannot be opened, read or written.
	"""
	## first pass: collect every header line, wherever it occurs in the file
	with open(sam_hits_filename, 'r') as sam_hits_file:
		header_line_list = [
			line.rstrip('\r\n') for line in sam_hits_file if line.startswith('@')
		]

	## sequence names whose output file has already been created in this run
	seen_sequence_names = set()
	previous_sequence_name = None
	output_file = None
	try:
		## second pass: route each record to the file for its sequence name
		with open(sam_hits_filename, 'r') as sam_hits_file:
			for line in sam_hits_file:
				line = line.rstrip('\r\n')
				## skip blank lines (indexing line[0] used to crash on them)
				## and header lines, which were gathered in the first pass
				if not line or line.startswith('@'):
					continue

				read_id_string = line.split('\t', 1)[0]
				sequence_name = _sequence_name_from_read_id(read_id_string)

				if sequence_name != previous_sequence_name:
					## switch output files: close the current one first
					if output_file is not None:
						output_file.close()
						output_file = None
					output_filename = ('%s/accepted_hits_for_%s.sam' % (output_directory_string, sequence_name))

					if sequence_name in seen_sequence_names:
						## seen before: keep what was already written
						output_file = open(output_filename, 'a')
					else:
						## first occurrence: start a fresh file with the header
						output_file = open(output_filename, 'w')
						for header_line in header_line_list:
							output_file.write('%s\n' % header_line)
						seen_sequence_names.add(sequence_name)
					previous_sequence_name = sequence_name

				output_file.write('%s\n' % line)
	finally:
		## make sure the last output file is closed even if an error occurred
		if output_file is not None:
			output_file.close()

	return

## run the split only when executed as a script, not when imported
if __name__ == '__main__':
	main(sys.argv[1], sys.argv[2])
