# DE.fishers.exact.test.R

source("./upper.quartile.normalization.R")

"DE.fishers.exact.test" <- function(locus.to.read.count.file.1, locus.to.read.count.file.2, pvalue.threshold, output.file){
	data.1 <- get.read.count.per.locus(locus.to.read.count.file.1)
	locus.array.1 <- data.1$locus.array
	read.count.array.1 <- data.1$read.count.array

	data.2 <- get.read.count.per.locus(locus.to.read.count.file.2)
	locus.array.2 <- data.2$locus.array
	read.count.array.2 <- data.2$read.count.array

	## check that the loci and their ordering are the same in both files
	if(length(locus.array.1) != length(locus.array.2)){
		stop(sprintf("loci arrays of different lengths: %d and %d\n", length(locus.array.1), length(locus.array.2)))
	}
	for(i in 1:length(locus.array.1)){
		if(locus.array.1[i] != locus.array.2[i]){
			stop(paste("different locus at same index: ", i, sep=""))
		}
	}
	locus.array <- locus.array.1
	total.number.of.loci <- length(locus.array)



	## for upper-quartile normalization -
	## find ratio between the 75th read count percentiles of the two experiments
	
	data <- get.upper.quartile.read.counts(read.count.array.1, read.count.array.2)
	upper.quartile.1 <- data$upper.quartile.1
	upper.quartile.2 <- data$upper.quartile.2

	cat("DEBUG: output file is", output.file, "\n")


	## Compute adjusted pvalues for differential expression of each locus

	pvalue.array <- c()
	direction.array <- c()
	for(i in 1:total.number.of.loci){
		locus <- locus.array[i]
		read.count.for.locus.on.exp.1 <- read.count.array.1[i]
		read.count.for.locus.on.exp.2 <- read.count.array.2[i]

		## Should double-check with Dr. Bullard that this is the correct interpretation of
		## the explanation in the 2010 mRNA-seq normalization paper
		GeneTesting <- matrix(c(read.count.for.locus.on.exp.1, upper.quartile.1, read.count.for.locus.on.exp.2, upper.quartile.2), nrow=2)
		#cat(sprintf("DEBUG: %s, %s, %s, %s\n", read.count.for.locus.on.exp.1, upper.quartile.1, read.count.for.locus.on.exp.2, upper.quartile.2))
		result <- fisher.test(GeneTesting, alternative="two.sided")
		pvalue <- result$p.value

		pvalue.array <- c(pvalue.array, pvalue)	

		if((read.count.for.locus.on.exp.1/upper.quartile.1) < (read.count.for.locus.on.exp.2/upper.quartile.2)){
			direction <- 'down'
		}else if((read.count.for.locus.on.exp.1/upper.quartile.1) > (read.count.for.locus.on.exp.2/upper.quartile.2)){
			direction <- 'up'
		}else{
			direction <- 'none'
		}

		direction.array <- c(direction.array, direction)
		#if((i %% 1000) == 0){
		#	cat(i, ",")
		#}
	}
	adjusted.pvalue.array <- p.adjust(pvalue.array, method="BH")


	locus.to.significantly.favored.direction <- array(NA, dim=total.number.of.loci)

	significant.locus.mask <-  (adjusted.pvalue.array < pvalue.threshold)
	for(i in 1:total.number.of.loci){
		locus <- locus.array[i]
		read.count.for.locus.on.exp.1 <- read.count.array.1[i]
		read.count.for.locus.on.exp.2 <- read.count.array.2[i]

		if(significant.locus.mask[i]){
			expected.global.bias.ratio <- 1
			significantly.favored.direction <- direction.array[i]
			if(significantly.favored.direction == "up"){
				# A is favored wrt B in terms of expression of the given locus
				significantly.favored.direction.mod <- 'X'
			}else if(significantly.favored.direction == "down"){
				# B is favored wrt A in terms of expression of the given locus
				significantly.favored.direction.mod <- "Y"
			}else{
				# neither condition is favored in terms of expression of the given locus
				significantly.favored.direction.mod <- "NA"
			}
		}else{
			significantly.favored.direction <- "none"
		}
		locus.to.significantly.favored.direction[i] <- significantly.favored.direction
	}


	## Print differential-expression results for each locus

	cat("locus_name\tcountE1\t75pctE1\tcountE2\t75pctE2\tBH-pval\tdirection_of_E1_with_respect_to_E2\n", file=output.file, append=FALSE)
	for(i in 1:total.number.of.loci){
		locus <- locus.array[i]
		adjusted.pvalue <- adjusted.pvalue.array[i]
		significantly.favored.direction <- locus.to.significantly.favored.direction[i]
		write(sprintf("%s\t%d\t%d\t%d\t%d\t%.3e\t%s", locus, read.count.array.1[i], upper.quartile.1, read.count.array.2[i], upper.quartile.2,
			    adjusted.pvalue, significantly.favored.direction),
		      file=output.file, append=TRUE)
	}
}

"get.read.count.per.locus" <- function(locus.to.read.count.file.1){
	table <- read.table(locus.to.read.count.file.1)
	if((table[1,1] == "locus") || (table[1,1] == "locus_name")){
		table <- table[2:(dim(table)[1]),]
	}
	locus.array <- data.matrix(table[,1])
	read.count.array <- as.numeric(data.matrix(table[,2]))
	list(locus.array=locus.array, read.count.array=read.count.array)
}


## Read arguments from file

argument.array <- as.matrix(read.table("DE.fishers.exact.test.arguments.file"))
if(length(argument.array) != 5){
	stop("Expected 5 arguments: locus.to.read.count.file.1, locus.to.read.count.file.2, pvalue.threshold, output.file\n")
}

locus.to.read.count.file.1 <- argument.array[1]
locus.to.read.count.file.2 <- argument.array[2]
pvalue.threshold <- as.numeric(argument.array[3])
output.file <- argument.array[4]


## Call function to compute differential expression

DE.fishers.exact.test(locus.to.read.count.file.1, locus.to.read.count.file.2, pvalue.threshold, output.file)

