# read.count.frequency.distribution.R

source("read.count.frequency.distribution.helper.functions.R")

## read.count.frequency.distribution
##
## Writes two tab-delimited tables that report, for every condition, how many
## loci fall into each read-count bin:
##   * RCFD.output.file                 - bins on the total reads per locus
##   * allele.specific.RCFD.output.file - bins on (ref + other) reads per locus
##
## Arguments:
##   RCFD.output.file
##       Path of the read count frequency distribution table to create.
##   allele.specific.RCFD.output.file
##       Path of the allele-specific table to create.
##   comma.delimited.condition.and.associated.statistics.per.locus.files.array
##       Character vector; each element is "condition,file1[,file2...]", i.e. a
##       condition name followed by at least one statistics-per-locus file.
##
## Both output files are overwritten. The first line is the header
## "read_ct<TAB>cond1<TAB>cond2...", followed by one line per bin
## (0-0, 1-1, 2-2, 3-5, 6-10, ..., 201-500, 501-Inf).
##
## Stops with an error when a condition has no files, when the locus lists of
## two conditions differ, or when a locus is missing from the loaded counts.
##
## Relies on hash() and
## get.regular.and.allele.specific.counts.from.set.of.statistics.per.locus.files(),
## neither of which is defined in this file (the file sources
## read.count.frequency.distribution.helper.functions.R).
##
## Returns NULL invisibly; the function is called for its file output.
read.count.frequency.distribution <- function(RCFD.output.file, allele.specific.RCFD.output.file, comma.delimited.condition.and.associated.statistics.per.locus.files.array){

	## ---- Load the per-locus counts of every condition ----
	condition.array <- character(0)
	condition.and.locus.to.num.reads.hashtable <- hash()
	condition.and.locus.to.num.ref.hashtable <- hash()
	condition.and.locus.to.num.other.hashtable <- hash()
	sorted.locus.array <- NULL
	for(comma.delimited.condition.and.associated.statistics.per.locus.files.string in comma.delimited.condition.and.associated.statistics.per.locus.files.array){
		array.of.condition.and.files <- strsplit(comma.delimited.condition.and.associated.statistics.per.locus.files.string, ',')[[1]]
		if(length(array.of.condition.and.files) == 0){
			stop("Internal Error in read.count.frequency.distribution(): This should not happen")
		}

		condition <- array.of.condition.and.files[1]
		if(length(array.of.condition.and.files) < 2){
			stop(sprintf("Could not find at least one statistics per locus file for condition %s", condition))
		}
		condition.array <- c(condition.array, condition)

		## Everything after the condition name is a statistics-per-locus file
		statistics.per.locus.file.array <- array.of.condition.and.files[-1]
		data <- get.regular.and.allele.specific.counts.from.set.of.statistics.per.locus.files(statistics.per.locus.file.array)
		condition.and.locus.to.num.reads.hashtable[[condition]] <- data$locus.to.num.reads.hashtable
		condition.and.locus.to.num.ref.hashtable[[condition]] <- data$locus.to.num.ref.hashtable
		condition.and.locus.to.num.other.hashtable[[condition]] <- data$locus.to.num.other.hashtable

		if(length(condition.array) == 1){
			## The first condition defines the reference locus list
			sorted.locus.array <- data$sorted.locus.array
		}else{
			## Error checking: every later condition must report exactly the same loci.
			## The vectorised comparison is also safe when the locus list is empty.
			current.sorted.locus.array <- data$sorted.locus.array
			if(length(current.sorted.locus.array) != length(sorted.locus.array)){
				stop("Locus lists are different between two or more statistics per locus files (1)")
			}
			if(any(current.sorted.locus.array != sorted.locus.array)){
				stop("Locus lists are different between two or more statistics per locus files (2)")
			}
		}
	}

	## ---- Initialize output files (at the paths supplied by the caller) ----
	header.line <- sprintf("read_ct\t%s\n", paste(condition.array, collapse="\t"))
	cat(header.line, file=RCFD.output.file, append=FALSE)
	cat(header.line, file=allele.specific.RCFD.output.file, append=FALSE)

	## ---- Pull the counts out of the hashtables once per condition ----
	## Doing this up front avoids repeating every lookup for every bin.
	num.loci <- length(sorted.locus.array)
	num.reads.per.condition <- vector("list", length(condition.array))
	num.allele.specific.per.condition <- vector("list", length(condition.array))
	for(k in seq_along(condition.array)){
		condition <- condition.array[k]
		locus.to.num.reads.hashtable <- condition.and.locus.to.num.reads.hashtable[[condition]]
		locus.to.num.ref.hashtable <- condition.and.locus.to.num.ref.hashtable[[condition]]
		locus.to.num.other.hashtable <- condition.and.locus.to.num.other.hashtable[[condition]]
		if((num.loci > 0) && (length(locus.to.num.reads.hashtable) == 0)){
			stop("Internal Error: found condition that does not have an entry in condition.and.locus.to.num.reads.hashtable")
		}

		num.reads.array <- numeric(num.loci)
		num.allele.specific.array <- numeric(num.loci)
		for(j in seq_len(num.loci)){
			locus <- sorted.locus.array[[j]]
			num.reads <- locus.to.num.reads.hashtable[[locus]]
			if(length(num.reads) == 0){
				stop("Internal Error: found locus and condition such that locus does not have an entry in condition.and.locus.to.num.reads.hashtable[[condition]]")
			}
			num.allele.specific <- locus.to.num.ref.hashtable[[locus]] + locus.to.num.other.hashtable[[locus]]
			if(length(num.allele.specific) == 0){
				stop(sprintf("Internal Error: locus %s of condition %s does not have both a ref and an other read count", locus, condition))
			}
			## [[<- refuses anything that is not exactly one value
			num.reads.array[[j]] <- num.reads
			num.allele.specific.array[[j]] <- num.allele.specific
		}
		if(anyNA(num.reads.array) || anyNA(num.allele.specific.array)){
			stop(sprintf("Internal Error: found a missing read count for condition %s", condition))
		}
		num.reads.per.condition[[k]] <- num.reads.array
		num.allele.specific.per.condition[[k]] <- num.allele.specific.array
	}

	## ---- Emit one line per read-count bin ----
	## Bin i covers (threshold[i-1] + 1) .. threshold[i]; the first bin is
	## exactly threshold[1] and the last bin is open-ended.
	num.reads.threshold.list <- c(0, 1, 2, 5, 10, 20, 50, 100, 200, 500)
	range.min.array <- c(num.reads.threshold.list[1], num.reads.threshold.list + 1)
	range.max.array <- c(num.reads.threshold.list, Inf)
	for(i in seq_along(range.max.array)){
		range.min <- range.min.array[i]
		range.max <- range.max.array[i]
		if(is.finite(range.max)){
			range.max.string <- sprintf('%.0f', range.max)
		}else{
			range.max.string <- "Inf"
		}
		range.string <- sprintf('%.0f-%s', range.min, range.max.string)

		## Number of loci, per condition, whose count lies inside the bin
		count.loci.in.range <- function(count.array){
			as.numeric(sum((range.min <= count.array) & (count.array <= range.max)))
		}
		reads.count.array <- vapply(num.reads.per.condition, count.loci.in.range, numeric(1))
		allele.specific.count.array <- vapply(num.allele.specific.per.condition, count.loci.in.range, numeric(1))

		read.count.frequency.distribution.output.line <- paste(c(range.string, sprintf('%.0f', reads.count.array)), collapse="\t")
		allele.specific.read.count.frequency.distribution.output.line <- paste(c(range.string, sprintf('%.0f', allele.specific.count.array)), collapse="\t")
		cat(sprintf('%s\n', read.count.frequency.distribution.output.line), file=RCFD.output.file, append=TRUE)
		cat(sprintf('%s\n', allele.specific.read.count.frequency.distribution.output.line), file=allele.specific.RCFD.output.file, append=TRUE)
	}

	invisible(NULL)
}


## Script entry point.
## The arguments are read from a fixed file in the working directory rather
## than from the command line. read.table() tokenizes on whitespace, and the
## resulting table is flattened to a matrix so that it can be indexed like a
## plain vector: element 1 and element 2 are the two output files, every
## remaining element is a "condition,file1[,file2...]" string.
argument.array <- as.matrix(
	read.table("read.count.frequency.distribution.arguments.file")
)

if(!(length(argument.array) >= 3)){
	stop("Usage: Expected at least 3 arguments (with each argument on a separate line): RCFD.output.file allele.specific.RCFD.output.file comma.delimited.condition.and.associated.statistics.per.locus.files [ . . . ] ")
}

read.count.frequency.distribution(
	argument.array[1],
	argument.array[2],
	argument.array[-(1:2)]
)
