#!/usr/bin/python
import os, csv, sys, math, time
import xml.dom.minidom
import networkx

#In this program, associated proteins will include proteins that associated theit children

def checkDup(P_list):
	PTp = P_list
	PTp.sort()
	for i in (range(len(PTp)-1)):
		if PTp[i]==PTp[i+1]:
			return 1
	return 0
def hyperGeo_pr(p_x,p_k,p_m,p_n):
	"""Function to calculate the probability
	input:
		p_x: number of white balls drawn.
		p_k: number of balls drawn (note p_x is not larger than p_k).
		p_m: number of white balls in the urn.
		p_n: number of black balls in the urn.
	output:
		the probability that has p_x white balls in a drawing of p_k balls.
	"""
	l_nue = []
	l_deno = []
	for j in range(p_x):
		l_nue.append(p_m-j)
		l_deno.append(j+1)
	for j in range(p_k-p_x):
		l_nue.append(p_n-j)
		l_deno.append(j+1)
	for j in range(p_k):
		l_deno.append(p_m+p_n-j)
		l_nue.append(j+1)
	l_deno.sort()
	l_nue.sort()
	#l_p = 1.0
	l_p2 = 0.0

	try:
		for j in range(len(l_deno)):
			#l_p = l_p*l_nue[j]/l_deno[j]
			l_p2 += math.log(l_nue[j]) - math.log(l_deno[j])
	
	except:
		return 0
	
	#return math.exp(l_p)
	return math.exp(l_p2)

def hyperGeo_Pvalue(white_ball_drawn,total_ball_drawn,white_ball_in_urn,total_ball_in_urn):
	"""Function to calculate the p_value
	input:
		white_ball_drawn: number of white balls drawn.
		white_ball_in_urn: number of white balls in the urn.
		total_ball_in_urn: number of black balls in the urn.
		total_ball_drawn: number of balls drawn (note white_ball_drawn is not larger than total_ball_drawn).
	output:
		1-the probability that has at least white_ball_drawn white balls in a drawing of total_ball_drawn balls.
	"""
	l_p = 0.0
	for j in range(white_ball_drawn):
		hptp = hyperGeo_pr(j,total_ball_drawn,white_ball_in_urn,total_ball_in_urn-white_ball_in_urn)
		l_p += hptp
	if l_p>1:
		l_p = 1.0
	return (1-l_p)

def Union(P_list1, P_list2):
	L_rets = []
	L_dic = {}
	for item in P_list1:
		L_rets.append(item)
		L_dic[item] = 1
	for item in P_list2:
		if not(L_dic.has_key(item)):
			L_rets.append(item)
	L_rets.sort()
	return L_rets

def Intersection(P_list1, P_list2):
	L_rets = []
	L_dic = {}
	for item in P_list1:
		L_dic[item] = 1
	for item in P_list2:
		if L_dic.has_key(item):
			L_rets.append(item)
	L_rets.sort()
	return L_rets

def Difference(P_list1, P_list2):
	#return P_list2 - P_list1
	L_rets = []
	L_dic = {}
	for item in P_list1:
		L_dic[item] = 1
	for item in P_list2:
		if not(L_dic.has_key(item)):
			L_rets.append(item)
	L_rets.sort()
	return L_rets

class GotermEnrich(networkx.DiGraph):
	def __init__(self,P_weightGraph, P_Associate):
		networkx.DiGraph.__init__(self)
		self.association = 'Association_proteins'
		self.association_acc = 'Association_Accumulation'
		self.mapping = 'mapping_protiens'
		self.PV = 'P_value'
		self.TotalLost = 0.0
		self.gene2Goterm = self.Gene2GoTermAssociation(P_Associate)
		self.Goterm2gene = self.GoTerm2GeneAssociation(P_Associate)
		self.NumberOfAllProtein = len(self.gene2Goterm)
		self.createDiGraph(P_weightGraph)

	def readWeights(self, P_filename = ''):
		'''
		read edge-weighted go structure data
		'''
		xmldoc = xml.dom.minidom.parse(P_filename)
		xmlData = xmldoc.firstChild
		goTermList = xmlData.getElementsByTagName('term')

		goGraphRe = {}
		count =0
		for term in goTermList:
			count += 0
			if count<10:
				go = term.attributes['id']
				goGraphRe[go.value] = {}
				parentList = term.getElementsByTagName('parent')
				for par in parentList:
					parGO = par.attributes['id']
					goGraphRe[go.value][parGO.value] = {}
					edgeList = par.getElementsByTagName('edge')
					for edge in edgeList:
						edgeType = edge.attributes['type']
						goGraphRe[go.value][parGO.value] = float(edge.firstChild.data)
						#print go.value, parGO.value, float(edge.firstChild.data)
		return goGraphRe

	def createDiGraph(self, P_filename):
		'''
		create edge-weighted Goterm structure
		'''
		GoStructure = self.readWeights(P_filename)
		for chtp in GoStructure:
			for pa in GoStructure[chtp]:
				self.add_edge(chtp,pa)
				self.edge[chtp][pa]['weight'] = GoStructure[chtp][pa]
		self.node_AssociationProtienData()
		self.node_Depth()

	def Gene2GoTermAssociation(self, P_fName):
		'''
		create gene-goterm association relation
		'''
		L_rets = {}
		L_retNoDup = {}
		L_fileLipid = open(P_fName)
		for line in L_fileLipid:
			line2 = line.split('\t')
			if len(line2)>10:
				if L_rets.has_key(line2[2]):
					L_rets[line2[2]][line2[4]] = 1
				else:
					L_rets[line2[2]] = {line2[4]:1}
		for gene in L_rets:
			GList = []
			for Gtemp in L_rets[gene]:
				GList.append(Gtemp)
			L_retNoDup[gene] = GList
		return L_retNoDup

	def GoTerm2GeneAssociation(self, P_fName):
		'''
		create goterm-gene association relation
		'''
		L_rets = {}
		L_retNoDup = {}
		L_fileLipid = open(P_fName)
		for line in L_fileLipid:
			line2 = line.split('\t')
			if len(line2)>10:
				if L_rets.has_key(line2[4]):
					L_rets[line2[4]][line2[2]] = 1
				else:
					L_rets[line2[4]] = {line2[2]:1}
		for Gtp in L_rets:
			GeneList = []
			for gene in L_rets[Gtp]:
				GeneList.append(gene)
			L_retNoDup[Gtp] = GeneList

		return L_retNoDup

	def propagate_AssociatedProteins(self):
		Top_sort_nodes = networkx.topological_sort(self)
		for nodeTp in Top_sort_nodes:
			ParentTp = self.successors(nodeTp)
			AssoSelf = self.node[nodeTp][self.association_acc]
			for nodePare in ParentTp:
				AssoPare = self.node[nodePare][self.association_acc]
				#print nodeTp,nodePare,AssoSelf,AssoPare
				UnionTwo = Union(AssoSelf,AssoPare)
				self.node[nodePare][self.association_acc] = UnionTwo

	def node_AssociationProtienData(self):
		for gotermTp in self.nodes():
			self.node[gotermTp]['level'] = 1000000000
			if self.Goterm2gene.has_key(gotermTp):
				self.node[gotermTp][self.association] = self.Goterm2gene[gotermTp]
				self.node[gotermTp][self.association_acc] = self.Goterm2gene[gotermTp]
			else:
				self.node[gotermTp][self.association] = []
				self.node[gotermTp][self.association_acc] = []
			self.node[gotermTp][self.mapping] = []

		self.propagate_AssociatedProteins()

	def node_Depth(self):
		Top_sort_nodes = networkx.topological_sort(self)
		OrderReverse=[]
		for i in range(len(Top_sort_nodes)):
			OrderReverse.append(Top_sort_nodes[len(Top_sort_nodes)-i-1])
		
		self.node[OrderReverse[0]]['level'] = 1
		for nodeTp in OrderReverse:
			ChildrenTp = self.predecessors(nodeTp)
			for chd in ChildrenTp:
				if self.node[chd]['level']>self.node[nodeTp]['level']+1:
					self.node[chd]['level']=self.node[nodeTp]['level']+1
		
	def symplify_Goterm_StructureNoDeletion(self,P_pv,P_size):
		Top_Order = networkx.topological_sort(self)
		for nodeTp in Top_Order:
			Children = self.predecessors(nodeTp)
			self.node[nodeTp]['leaf'] = 2
			tempSt = 1
			for child in Children:
				if self.node[child]['leaf'] > 0:
					tempSt = 0;break
			if tempSt == 1:
				if len(self.node[nodeTp][self.mapping])>=P_size and self.node[nodeTp][self.PV]<=P_pv:
					self.node[nodeTp]['leaf'] = 1
				else:
					self.node[nodeTp]['leaf'] = 0
				



	def printResult(self, P_pvThreshold):
		L_Goterms = {}
		for gotp in self.nodes():
			if len(self.node[gotp][self.mapping])>0 and self.node[gotp][self.PV]<=P_pvThreshold:
				L_Goterms[gotp] = [self.node[gotp][self.mapping],self.node[gotp]['level'],self.node[gotp][self.PV]]
		return L_Goterms



	def Goterm_enrich(self, P_protein_InList, P_pvThreshold, P_topRank,P_size):
		Gene_List_Associated = []
		PV_Score = {}
		L_retTemp =[]
		for gene in P_protein_InList:
			if self.gene2Goterm.has_key(gene):
				Gene_List_Associated.append(gene)
		self.NumberOfProtein_InList = len(Gene_List_Associated)

		for nodeID in self.nodes():
			P_Association=self.node[nodeID][self.association]
			P_Association2=self.node[nodeID][self.association_acc]
			P_protein = Intersection(P_protein_InList,P_Association2)
			self.node[nodeID][self.mapping] = P_protein
				
			newPV = hyperGeo_Pvalue(len(P_protein),self.NumberOfProtein_InList, len(P_Association2),self.NumberOfAllProtein)
			self.node[nodeID][self.PV] = newPV
			'''
			if newPV<=P_pvThreshold:
				L_retTemp.append([nodeID,P_protein,newPV])
				PV_Score[newPV]=1
			'''
			
			#---
			if checkDup(P_Association)==1:
				print 'Asso dup'
			if checkDup(P_protein_InList)==1:
				print 'PList dup'
			if checkDup(P_protein)==1:
				print 'Intersection dup'
		self.symplify_Goterm_StructureNoDeletion(P_pvThreshold,P_size)
		for nodeID in self.nodes():
			if self.node[nodeID]['leaf'] == 1:
				L_retTemp.append([nodeID,self.node[nodeID][self.mapping],self.node[nodeID][self.PV],self.node[nodeID]['level']])
				PV_Score[self.node[nodeID][self.PV]]=1

		PV_ScoreSort = []
		for item in PV_Score:
			PV_ScoreSort.append(item)
		PV_ScoreSort.sort()

		idxTp = P_topRank-1
		if idxTp>=len(PV_ScoreSort)-1:
			idxTp = len(PV_ScoreSort)-1
		ScoreUp = -1
		if idxTp>0:
			ScoreUp = PV_ScoreSort[idxTp]
		
		L_ret = []
		for item in L_retTemp:
			if item[2]<=ScoreUp:
				L_ret.append(item)
				#print item


		return L_ret



'''
#Example of using class GotermEnrich

#Generate an object of the GotermEnrich class
weightGographData = './data/newWeightedPubMedGO.xml'
geneGotermAssociationData = './data/gene_association.sgd'
G = GotermEnrich(weightGographData,geneGotermAssociationData)

#two list needed to be processed
list1= ['FUS3', 'RAD6', 'STE24', 'DIG2', 'DIG1', 'STE11', 'STE12', 'STE7', 'STE4', 'STE5', 'STE18', 'KSS1']
list2= ['YER044C', 'ERG3', 'CMD1', 'ERG11', 'YMR031W-A', 'SWI4', 'TUNICAMYCIN']


#call the Go term enriched function (Parameters: list1--a list of genes; 0.05--the PValue threshold;
#        2--number of top score (PValue) Go terms that are outputed. 5--Minimum number of genes in the Go terms of solutions)
#The return value of Goterm_enrich() is a list, where each element is also a list 
#               [goterm ID, a list of genes annotated with this goterm, PValue, the level of go term in the Ontology structure].
result1 = G.Goterm_enrich(list1,0.05,2,5)
result2 = G.Goterm_enrich(list2,0.05,2,3)

'''