#!/usr/bin/env python
import sys
import random

# Validate the command line before doing any work: the script name plus exactly
# four arguments (input pileup, output pileup, subset mode, subset factor).
if len(sys.argv) != 5:
	# Call-style raise behaves the same as the old "raise E, msg" statement on
	# Python 2 and, unlike it, is also valid syntax on Python 3.
	raise IOError('Usage: ./subset_pileup.out.py pileup.out output_pileup.out ["fraction" subset_fraction | "cap" subset_cap]')

def subset_pileup_file(pileup_filename, output_filename, subset_option, subset_factor_string):
	"""Subset every line of a pileup file and write the result to a new file.

	Each input line is handed to subset_pileup_line(), which writes the
	subsetted line to the output file.  The running line count is printed to
	stdout once every 1000 lines as a progress indicator.

	Parameters:
		pileup_filename: path of the pileup file to read.
		output_filename: path of the file to write; opened in 'w' mode, so an
			existing file is overwritten.
		subset_option: subsetting mode, forwarded unchanged to subset_pileup_line().
		subset_factor_string: numeric argument for the mode, still as a string,
			forwarded unchanged to subset_pileup_line().
	"""
	# The context managers close both files even if a line fails to parse;
	# the input is opened first so a missing input never creates an output file.
	with open(pileup_filename, 'r') as pileup_file:
		with open(output_filename, 'w') as output_file:
			for counter, line in enumerate(pileup_file):
				if counter % 1000 == 0:
					# Parenthesised single argument prints identically on Python 2 and 3
					print(str(counter) + ',')
				subset_pileup_line(line, output_file, subset_option, subset_factor_string)

def subset_pileup_line(line, output_file, subset_option, subset_factor_string):
	"""Subset one tab-separated pileup line and write it to output_file.

	Column 3 is parsed as the integer coverage.  Columns 4, 5 and 6 are
	per-read columns: their first character is dropped before masking and an
	'@' is written in its place on output (presumably the input carries the
	same '@' prefix -- confirm against the input format).  One mask is drawn
	for the line and applied to all three columns, so the same positions are
	kept in each.  Columns 0-2 are copied through unchanged.

	Parameters:
		line: one raw line of the pileup file, with or without a line ending.
		output_file: writable file object that receives the subsetted line.
		subset_option: subsetting mode, forwarded to get_subset_mask().
		subset_factor_string: numeric argument for the mode, as a string.
	"""
	columns = line.rstrip('\n\r').split('\t')

	coverage = int(columns[3])
	# Skip the one-character prefix of each per-read column
	per_read_columns = [columns[4][1:], columns[5][1:], columns[6][1:]]

	mask = get_subset_mask(coverage, subset_option, subset_factor_string)
	kept_count = get_subset_coverage(mask)

	output_columns = [columns[0], columns[1], columns[2], str(kept_count)]
	for per_read_column in per_read_columns:
		output_columns.append('@' + apply_subset_mask(per_read_column, mask))

	output_file.write('\t'.join(output_columns) + '\n')

def get_subset_mask(coverage, subset_option, subset_factor_string):
	"""Build a keep/drop mask for `coverage` positions using the requested mode.

	Parameters:
		coverage: number of positions the mask must cover.
		subset_option: "fraction" to keep each position independently with a
			given probability, or "cap" to keep at most a given number of
			positions.
		subset_factor_string: the mode's numeric argument as a string; parsed
			with float() for "fraction" and int() for "cap".

	Returns:
		A dict mapping each index in 0..coverage-1 to True (keep) or False (drop).

	Raises:
		IOError: if subset_option is neither "fraction" nor "cap".
		ValueError: if subset_factor_string cannot be parsed as a number.
	"""
	if subset_option == "fraction":
		return get_subset_mask_for_fraction(coverage, float(subset_factor_string))
	if subset_option == "cap":
		return get_subset_mask_for_cap(coverage, int(subset_factor_string))
	# Call-style raise is valid on both Python 2 and Python 3
	raise IOError('Error: invalid subset option')

def get_subset_mask_for_fraction(coverage, subset_fraction):
	"""Build a mask that keeps each position independently with a fixed probability.

	One random.random() draw is made per position, in index order; the
	position is kept when the draw is strictly below subset_fraction.

	Parameters:
		coverage: number of positions in the mask.
		subset_fraction: probability of keeping any single position.

	Returns:
		A dict mapping each index in 0..coverage-1 to True (keep) or False (drop).
	"""
	return dict(
		(position, random.random() < subset_fraction)
		for position in range(coverage)
	)

def get_subset_mask_for_cap(coverage, subset_cap):
	"""Build a mask that keeps at most subset_cap of the coverage positions.

	When coverage does not exceed the cap every position is kept.  Otherwise
	the positions returned by get_subset_indices() are kept and all others
	are dropped.

	Parameters:
		coverage: number of positions in the mask.
		subset_cap: maximum number of positions to keep.

	Returns:
		A dict mapping each index in 0..coverage-1 to True (keep) or False (drop).
	"""
	if coverage <= subset_cap:
		return dict((position, True) for position in range(coverage))

	# A set makes each membership test O(1); testing against the returned list
	# made this loop O(coverage * subset_cap) on deep positions.
	kept_positions = set(get_subset_indices(coverage, subset_cap))
	return dict(
		(position, position in kept_positions)
		for position in range(coverage)
	)

def get_subset_indices(coverage, subset_cap):
	"""Pick subset_cap distinct indices from 0..coverage-1 at random.

	Indices are drawn one at a time without replacement, using one
	random.randrange() call per draw.

	Parameters:
		coverage: size of the index pool (indices 0..coverage-1).
		subset_cap: number of indices to draw.

	Returns:
		A list of the drawn indices, in the order they were drawn.

	Raises:
		ValueError: from random.randrange() if subset_cap exceeds coverage,
			because the pool runs empty before all draws are made.
	"""
	# list() is required on Python 3, where range() is not a mutable list
	remaining_indices = list(range(coverage))
	subset_indices = []
	for _ in range(subset_cap):
		slot = random.randrange(0, len(remaining_indices))
		# pop() removes by position, avoiding the linear search of remove(value)
		subset_indices.append(remaining_indices.pop(slot))
	return subset_indices

def get_subset_coverage(subset_mask):
	"""Count how many positions a mask keeps.

	Parameters:
		subset_mask: dict keyed by the consecutive indices 0..len-1, whose
			values compare equal to True for kept positions.

	Returns:
		The number of indices whose mask value equals True.
	"""
	return sum(
		1
		for position in range(len(subset_mask))
		if subset_mask[position] == True
	)

def apply_subset_mask(string, subset_mask):
	"""Return the characters of `string` whose positions the mask keeps.

	Parameters:
		string: the text to subset, one character per mask position.
		subset_mask: dict mapping each index 0..len(string)-1 to True (keep)
			or False (drop).

	Returns:
		A new string containing the kept characters in their original order;
		the empty string when `string` is empty.

	Raises:
		IOError: if the mask and the string have different lengths.
	"""
	if len(string) != len(subset_mask):
		# Call-style raise is valid on both Python 2 and Python 3
		raise IOError('Subset mask must be the same length as the string it is subsetting')

	# join() builds the result in one pass instead of repeated concatenation.
	# "== True" matches how get_subset_coverage() counts kept positions.
	return "".join([
		character
		for position, character in enumerate(string)
		if subset_mask[position] == True
	])

# Run the subsetting with the four command-line arguments (input path, output
# path, subset mode, subset factor); the argument-count check near the top of
# the file guarantees exactly four are present.
subset_pileup_file(*sys.argv[1:5])
