#!/usr/bin/env python
# get_prototype_recognition_sequences.py

import sys

# Single-letter nucleic acid codes that count as part of a recognition
# sequence: the four bases A, C, T, G followed by the ambiguity codes.
# NOTE: the variable name is misspelled ("nueleic") but is kept as-is because
# other code in this file refers to it by this name.
nueleic_acid_code_symbols = list('ACTGMRWSYKVHDBN')

def get_offsets_for_cuts_listed_in_parentheses(recognition_sequence_string):
	"""Strip parenthesised cut annotations from a recognition sequence string.

	The string may start and/or end with a parenthesised group such as
	``(10/12)``.  For each group present, the integer before the ``/`` (or the
	whole group content when there is no ``/``) is returned as the
	forward-strand cut offset for that side, and the group is removed from
	the string.

	Args:
		recognition_sequence_string: the raw recognition sequence, e.g.
			``"(10/12)CGANNNNNNTGC(12/10)"`` or ``"G^AATTC"``.

	Returns:
		A tuple ``(sequence, lhs_offset, rhs_offset)`` where ``sequence`` is
		the input with any leading/trailing parenthesised group removed, and
		each offset is an ``int``, or the string ``'NA'`` when that side has
		no parenthesised group.

	Raises:
		ValueError: if a leading ``(`` has no closing ``)``, a trailing ``)``
			has no opening ``(``, or the offset text is not an integer.
	"""
	forward_strand_cut_offset_on_lhs = 'NA'
	forward_strand_cut_offset_on_rhs = 'NA'

	# check for explicitly listed cut on LHS
	# (startswith() instead of [0] so that an empty string is not an error)
	if recognition_sequence_string.startswith('('):
		end_parenthesis_index = recognition_sequence_string.find(')')
		if end_parenthesis_index == -1:
			raise ValueError('Unmatched \'(\' in recognition sequence: ' + recognition_sequence_string)
		lhs_cut_string = recognition_sequence_string[1:end_parenthesis_index]
		recognition_sequence_string = recognition_sequence_string[(end_parenthesis_index + 1):]
		forward_strand_cut_offset_on_lhs = int(lhs_cut_string.split('/')[0])

	# check for explicitly listed cut on RHS
	# (endswith() also copes with the string being empty once the LHS group
	# has been removed)
	if recognition_sequence_string.endswith(')'):
		# rfind: the group closed by the final ')' opens at the LAST '('
		open_parenthesis_index = recognition_sequence_string.rfind('(')
		if open_parenthesis_index == -1:
			raise ValueError('Unmatched \')\' in recognition sequence: ' + recognition_sequence_string)
		# drop the closing ')' so a group without '/' (e.g. "(5)") parses too
		rhs_cut_string = recognition_sequence_string[(open_parenthesis_index + 1):-1]
		recognition_sequence_string = recognition_sequence_string[0:open_parenthesis_index]
		forward_strand_cut_offset_on_rhs = int(rhs_cut_string.split('/')[0])

	return recognition_sequence_string, forward_strand_cut_offset_on_lhs, forward_strand_cut_offset_on_rhs

def parse_recognition_sequence(unparsed_recognition_sequence_string):
	"""Split a raw recognition sequence into its symbols and its cut offsets.

	Args:
		unparsed_recognition_sequence_string: the raw recognition sequence,
			which may contain a '^' cut marker and/or leading/trailing
			parenthesised cut annotations.

	Returns:
		A tuple ``(recognition_sequence, cut_offset_list)``.
		``recognition_sequence`` contains only the characters found in
		``nueleic_acid_code_symbols``.  ``cut_offset_list`` holds, in this
		order and only when present: the offset of the '^' marker (number of
		sequence symbols preceding it), the negated LHS parenthesised offset,
		and the sequence length plus the RHS parenthesised offset.

	Raises:
		IOError: if more than one '^' character is found.
	"""
	recognition_sequence_string_with_no_parentheses, forward_strand_cut_offset_on_lhs, forward_strand_cut_offset_on_rhs \
			= get_offsets_for_cuts_listed_in_parentheses(unparsed_recognition_sequence_string)

	# record offset of cut represented by the '^' character
	# and determine the recognition sequence
	cut_offset_list = []
	found_caret_character = False
	recognition_symbols = []
	for symbol in recognition_sequence_string_with_no_parentheses:
		if symbol in nueleic_acid_code_symbols:
			recognition_symbols.append(symbol)
		elif symbol == '^':
			if found_caret_character:
				# call form of raise is valid on both Python 2 and Python 3
				raise IOError('Found two \'^\' characters in the same recognition sequence')
			cut_offset_list.append(len(recognition_symbols))
			found_caret_character = True
		# any other character is ignored
	recognition_sequence = ''.join(recognition_symbols)

	# record offsets of LHS and RHS cuts that were listed in parentheses
	# ('NA' is the marker the helper returns when a side has no such cut)
	if forward_strand_cut_offset_on_lhs != 'NA':
		cut_offset_list.append(-forward_strand_cut_offset_on_lhs)
	if forward_strand_cut_offset_on_rhs != 'NA':
		cut_offset_list.append(len(recognition_sequence) + forward_strand_cut_offset_on_rhs)

	return recognition_sequence, cut_offset_list

# Script body: read the REBASE recognition-site table and write one line per
# prototype enzyme to stdout in the form
#   <enzyme> TAB <recognition sequence> TAB <comma-separated cut offsets or NA>
rebase_recognition_site_filename = "type_2_restriction_enzyme_recognition_sites_from_REBASE"

# Use a with-block so the file is closed even if a row fails to parse.
with open(rebase_recognition_site_filename, 'r') as rebase_recognition_site_file:
	# skip first 10 lines (presumably the file's header -- confirm against the input file)
	for i in range(0, 10):
		rebase_recognition_site_file.readline()

	for line in rebase_recognition_site_file:
		line = line.rstrip('\n\r')
		if line == "":
			continue
		# columns: enzyme name, prototype, recognition sequence
		field_list = line.split('\t')
		enzyme = field_list[0]
		prototype = field_list[1]
		recognition_sequence_unparsed = field_list[2].strip()
		# only rows with an empty prototype column are kept;
		# such an enzyme must itself be a prototype
		if prototype != "":
			continue
		recognition_sequence, cut_offset_list = parse_recognition_sequence(recognition_sequence_unparsed)
		# an empty offset list is reported as NA
		cut_offsets_field = ','.join([str(cut_offset) for cut_offset in cut_offset_list]) or 'NA'
		sys.stdout.write(enzyme + '\t' + recognition_sequence + '\t' + cut_offsets_field + '\n')

