## Note: this code can be modified to search a specific subset of the (MSU japonica rice) genome

## the four nucleotide letters against which a SNP base is validated
base_list = list('ACGT')

def get_SNPs_from_file(SNP_filename):
	"""Load SNPs from a tab-delimited text file.

	Each line must hold exactly four tab-separated fields: chromosome,
	position, reference base and SNP base.  A line whose position field is
	'pos' or 'position' (case-insensitive) is treated as a header line and
	skipped.

	Returns a dict whose keys are the chromosome, integer position,
	reference base and SNP base joined by tab characters, and whose values
	are all initialised to 0.

	Raises IOError if a line does not have 4 fields, if the SNP base is not
	one of base_list, or if the same SNP appears on two different lines.
	A non-integer position propagates the ValueError raised by int().
	"""
	SNP_dict = {}
	## 'with' guarantees the file is closed, including when a line is rejected
	with open(SNP_filename, 'r') as SNP_file:
		for line in SNP_file:
			field_list = line.rstrip('\r\n').split('\t')
			if len(field_list) != 4:
				raise IOError('Expected each line to have 4 fields representing a single SNP')
			## named 'chrom' rather than 'chr' so the builtin chr() is not shadowed
			chrom, position_string, refbase, SNPbase = field_list

			if position_string.lower() in ('pos', 'position'):
				## header line
				continue

			pos = int(position_string)

			## To search only a subset of the genome, an additional location
			## test can be added here, e.g. using
			## is_MSU_BGI_conserved_location(chrom, pos).
			## Only the SNP base is validated; the reference base is accepted as given.
			if SNPbase not in base_list:
				raise IOError('SNPbase(%s) should be one of %s' % (SNPbase, ','.join(base_list)))
			SNP = '%s\t%d\t%s\t%s' % (chrom, pos, refbase, SNPbase)
			if SNP in SNP_dict:
				raise IOError('Saw the same SNP on two different lines')
			SNP_dict[SNP] = 0
	return SNP_dict

## These ranges are pretty approximate - they were obtained by eyeballing the
## locations of the SNPs present in both my 93-11 SNP predictions and the
## Lu et al. 2010 93-11 homozygous SNP set.
def is_MSU_BGI_conserved_location(chr, pos):
	"""Return True if pos on chr falls inside one of the hard-coded ranges below, else False."""
	## (chromosome name, lowest accepted position or None for no lower bound,
	##  highest accepted position) - both bounds are inclusive
	regions = (
		('chr01|13101', None, 5373636),
		('chr02|13102', None, 5966345),
		('chr06|13106', 135970, 10562603),
		('chr08|13108', None, 11141915),
		('chr12|13112', None, 991161),
	)
	for name, lowest, highest in regions:
		if chr == name and (lowest is None or pos >= lowest) and pos <= highest:
			return True
	return False

