#!/usr/bin/python
import os, csv, sys, math
import xml.dom.minidom
import networkx

#In this program, associated proteins will include proteins that associated theit children

def checkDup(P_list):
	PTp = P_list
	PTp.sort()
	for i in (range(len(PTp)-1)):
		if PTp[i]==PTp[i+1]:
			return 1
	return 0
def hyperGeo_pr(p_x,p_k,p_m,p_n):
	"""Function to calculate the probability
	input:
		p_x: number of white balls drawn.
		p_k: number of balls drawn (note p_x is not larger than p_k).
		p_m: number of white balls in the urn.
		p_n: number of black balls in the urn.
	output:
		the probability that has p_x white balls in a drawing of p_k balls.
	"""
	l_nue = []
	l_deno = []
	for j in range(p_x):
		l_nue.append(p_m-j)
		l_deno.append(j+1)
	for j in range(p_k-p_x):
		l_nue.append(p_n-j)
		l_deno.append(j+1)
	for j in range(p_k):
		l_deno.append(p_m+p_n-j)
		l_nue.append(j+1)
	l_deno.sort()
	l_nue.sort()
	#l_p = 1.0
	l_p2 = 0.0

	try:
		for j in range(len(l_deno)):
			#l_p = l_p*l_nue[j]/l_deno[j]
			l_p2 += math.log(l_nue[j]) - math.log(l_deno[j])
	
	except:
		return 0
	
	#return math.exp(l_p)
	return math.exp(l_p2)

def hyperGeo_Pvalue(white_ball_drawn,total_ball_drawn,white_ball_in_urn,total_ball_in_urn):
	"""Function to calculate the p_value
	input:
		white_ball_drawn: number of white balls drawn.
		white_ball_in_urn: number of white balls in the urn.
		total_ball_in_urn: number of black balls in the urn.
		total_ball_drawn: number of balls drawn (note white_ball_drawn is not larger than total_ball_drawn).
	output:
		1-the probability that has at least white_ball_drawn white balls in a drawing of total_ball_drawn balls.
	"""
	l_p = 0.0
	for j in range(white_ball_drawn):
		hptp = hyperGeo_pr(j,total_ball_drawn,white_ball_in_urn,total_ball_in_urn-white_ball_in_urn)
		l_p += hptp
	if l_p>1:
		l_p = 1.0
	return (1-l_p)

def Union(P_list1, P_list2):
	L_rets = []
	L_dic = {}
	for item in P_list1:
		L_rets.append(item)
		L_dic[item] = 1
	for item in P_list2:
		if not(L_dic.has_key(item)):
			L_rets.append(item)
	L_rets.sort()
	return L_rets

def Intersection(P_list1, P_list2):
	L_rets = []
	L_dic = {}
	for item in P_list1:
		L_dic[item] = 1
	for item in P_list2:
		if L_dic.has_key(item):
			L_rets.append(item)
	L_rets.sort()
	return L_rets

def Difference(P_list1, P_list2):
	#return P_list2 - P_list1
	L_rets = []
	L_dic = {}
	for item in P_list1:
		L_dic[item] = 1
	for item in P_list2:
		if not(L_dic.has_key(item)):
			L_rets.append(item)
	L_rets.sort()
	return L_rets

class GoGraph(networkx.DiGraph):
	def __init__(self,P_weightGraph, P_Associate):
		networkx.DiGraph.__init__(self)
		self.association = 'Association_proteins'
		self.association_acc = 'Association_Accumulation'
		self.mapping = 'mapping_protiens'
		self.PV = 'P_value'
		self.TotalLost = 0.0
		self.gene2Goterm = self.Gene2GoTermAssociation(P_Associate)
		self.Goterm2gene = self.GoTerm2GeneAssociation(P_Associate)
		self.NumberOfAllProtein = len(self.gene2Goterm)
		self.createDiGraph(P_weightGraph)

	def readWeights(self, P_filename = ''):
		'''
		read edge-weighted go structure data
		'''
		xmldoc = xml.dom.minidom.parse(P_filename)
		xmlData = xmldoc.firstChild
		goTermList = xmlData.getElementsByTagName('term')

		goGraphRe = {}
		count =0
		for term in goTermList:
			count += 0
			if count<10:
				go = term.attributes['id']
				goGraphRe[go.value] = {}
				parentList = term.getElementsByTagName('parent')
				for par in parentList:
					parGO = par.attributes['id']
					goGraphRe[go.value][parGO.value] = {}
					edgeList = par.getElementsByTagName('edge')
					for edge in edgeList:
						edgeType = edge.attributes['type']
						goGraphRe[go.value][parGO.value] = float(edge.firstChild.data)
						#print go.value, parGO.value, float(edge.firstChild.data)
		return goGraphRe

	def createDiGraph(self, P_filename):
		'''
		create edge-weighted Goterm structure
		'''
		GoStructure = self.readWeights(P_filename)
		for chtp in GoStructure:
			for pa in GoStructure[chtp]:
				self.add_edge(chtp,pa)
				self.edge[chtp][pa]['weight'] = GoStructure[chtp][pa]
		self.node_AssociationProtienData()
		self.node_Depth()

	def Gene2GoTermAssociation(self, P_fName):
		'''
		create gene-goterm association relation
		'''
		L_rets = {}
		L_retNoDup = {}
		L_fileLipid = open(P_fName)
		for line in L_fileLipid:
			line2 = line.split('\t')
			if len(line2)>10:
				if L_rets.has_key(line2[2]):
					L_rets[line2[2]][line2[4]] = 1
				else:
					L_rets[line2[2]] = {line2[4]:1}
		for gene in L_rets:
			GList = []
			for Gtemp in L_rets[gene]:
				GList.append(Gtemp)
			L_retNoDup[gene] = GList
		return L_retNoDup

	def GoTerm2GeneAssociation(self, P_fName):
		'''
		create goterm-gene association relation
		'''
		L_rets = {}
		L_retNoDup = {}
		L_fileLipid = open(P_fName)
		for line in L_fileLipid:
			line2 = line.split('\t')
			if len(line2)>10:
				if L_rets.has_key(line2[4]):
					L_rets[line2[4]][line2[2]] = 1
				else:
					L_rets[line2[4]] = {line2[2]:1}
		for Gtp in L_rets:
			GeneList = []
			for gene in L_rets[Gtp]:
				GeneList.append(gene)
			L_retNoDup[Gtp] = GeneList

		return L_retNoDup

	def propagate_AssociatedProteins(self):
		Top_sort_nodes = networkx.topological_sort(self)
		for nodeTp in Top_sort_nodes:
			ParentTp = self.successors(nodeTp)
			AssoSelf = self.node[nodeTp][self.association_acc]
			for nodePare in ParentTp:
				AssoPare = self.node[nodePare][self.association_acc]
				#print nodeTp,nodePare,AssoSelf,AssoPare
				UnionTwo = Union(AssoSelf,AssoPare)
				self.node[nodePare][self.association_acc] = UnionTwo

	def node_AssociationProtienData(self):
		for gotermTp in self.nodes():
			self.node[gotermTp]['level'] = 1000000000
			if self.Goterm2gene.has_key(gotermTp):
				self.node[gotermTp][self.association] = self.Goterm2gene[gotermTp]
				self.node[gotermTp][self.association_acc] = self.Goterm2gene[gotermTp]
			else:
				self.node[gotermTp][self.association] = []
				self.node[gotermTp][self.association_acc] = []
			self.node[gotermTp][self.mapping] = []

		self.propagate_AssociatedProteins()

	def node_Depth(self):
		Top_sort_nodes = networkx.topological_sort(self)
		OrderReverse=[]
		for i in range(len(Top_sort_nodes)):
			OrderReverse.append(Top_sort_nodes[len(Top_sort_nodes)-i-1])
		
		self.node[OrderReverse[0]]['level'] = 1
		for nodeTp in OrderReverse:
			ChildrenTp = self.predecessors(nodeTp)
			for chd in ChildrenTp:
				if self.node[chd]['level']>self.node[nodeTp]['level']+1:
					self.node[chd]['level']=self.node[nodeTp]['level']+1
		
	def symplify_Goterm_Structure(self):
		Top_Order = networkx.topological_sort(self)
		for nodeTp in Top_Order:
			if len(self.node[nodeTp][self.mapping])==0 and len(self.predecessors(nodeTp)) == 0:
				self.remove_node(nodeTp)

	def node_Mapping_ProteinList_PV1(self, P_protein_InList):
		Gene_List_Associated = []
		for gene in P_protein_InList:
			if self.gene2Goterm.has_key(gene):
				Gene_List_Associated.append(gene)
		self.NumberOfProtein_InList = len(Gene_List_Associated)
		
		for nodeID in self.nodes():
			P_Association=self.node[nodeID][self.association]
			P_protein = Intersection(P_protein_InList,P_Association)
			self.node[nodeID][self.mapping] = P_protein
			newPV = hyperGeo_Pvalue(len(P_protein),self.NumberOfProtein_InList, len(P_Association),self.NumberOfAllProtein)
			self.node[nodeID][self.PV] = newPV
			
			#---
			if checkDup(P_Association)==1:
				print 'Asso dup'
			if checkDup(P_protein_InList)==1:
				print 'PList dup'
			if checkDup(P_protein)==1:
				print 'Intersection dup'

	def node_Mapping_ProteinList_PV2(self, P_protein_InList):
		Gene_List_Associated = []
		for gene in P_protein_InList:
			if self.gene2Goterm.has_key(gene):
				Gene_List_Associated.append(gene)
		self.NumberOfProtein_InList = len(Gene_List_Associated)

		for nodeID in self.nodes():
			P_Association=self.node[nodeID][self.association]
			P_Association2=self.node[nodeID][self.association_acc]
			P_protein = Intersection(P_protein_InList,P_Association)
			self.node[nodeID][self.mapping] = P_protein
				
			newPV = hyperGeo_Pvalue(len(P_protein),self.NumberOfProtein_InList, len(P_Association2),self.NumberOfAllProtein)
			self.node[nodeID][self.PV] = newPV
			
			#---
			if checkDup(P_Association)==1:
				print 'Asso dup'
			if checkDup(P_protein_InList)==1:
				print 'PList dup'
			if checkDup(P_protein)==1:
				print 'Intersection dup'


	def calculate_lost(self, P_nodeSource, P_nodeTarget):
		edgeWeight = self.edge[P_nodeSource][P_nodeTarget]['weight']
		L_lost = edgeWeight*len(self.node[P_nodeSource][self.mapping])
		#print 'edgeWeight=',edgeWeight, 'sizeOfSource=',len(self.node[P_nodeSource][self.mapping]),'lost=',L_lost
		return L_lost

	def merge_nodes(self, P_nodeSource, P_nodeTarget):
		'''
		merge association protien in source node into target node,
		update the P_value of the target node, and delete source node from the graph
		'''
		L_lost = self. calculate_lost(P_nodeSource, P_nodeTarget)
		P_Source = self.node[P_nodeSource][self.association]
		P_Target = self.node[P_nodeTarget][self.association]
		self.node[P_nodeTarget][self.association] = Union(P_Source, P_Target)

		#---
		if checkDup(self.node[P_nodeTarget][self.association]) == 1:
			print '---Asso dup'

		P_Source2 = self.node[P_nodeSource][self.mapping]
		P_Target2 = self.node[P_nodeTarget][self.mapping]
		self.node[P_nodeTarget][self.mapping] = Union(P_Source2, P_Target2)

		#---
		if len(self.node[P_nodeTarget][self.mapping])>self.NumberOfProtein_InList:
			print len(P_Source2), len(P_Target2)
		if checkDup(self.node[P_nodeTarget][self.mapping]) ==1:
			print '!!! mapping Dup'

		#print 'All protiens=',self.NumberOfAllProtein, 'All white protien#=',len(self.node[P_nodeTarget][self.association])
		#print 'whiteDraw=',len(self.node[P_nodeTarget][self.mapping]), 'Alldraw = ',self.NumberOfProtein_InList
		newPV = hyperGeo_Pvalue(len(self.node[P_nodeTarget][self.mapping]),self.NumberOfProtein_InList, len(self.node[P_nodeTarget][self.association]),self.NumberOfAllProtein)
		self.node[P_nodeTarget][self.PV] = newPV
		self.TotalLost += L_lost
		Children_S = self.predecessors(P_nodeSource)
		Children_T = self.predecessors(P_nodeTarget)
		C_S_sub_T = Difference(Children_T,Children_S)
		#print 'Children of S not of T=',C_S_sub_T
		for proteinTp in C_S_sub_T:
			self.add_edge(proteinTp,P_nodeTarget)
			self.edge[proteinTp][P_nodeTarget]['weight'] = self.edge[proteinTp][P_nodeSource]['weight'] + self.edge[P_nodeSource][P_nodeTarget]['weight']
		self.remove_node(P_nodeSource)


	def merge_nodes2(self, P_nodeSource, P_nodeTarget):
		'''
		merge association protien in source node into target node,
		update the P_value of the target node, and delete source node from the graph
		'''
		L_lost = self. calculate_lost(P_nodeSource, P_nodeTarget)
		P_Source = self.node[P_nodeSource][self.association]
		P_Target = self.node[P_nodeTarget][self.association]
		self.node[P_nodeTarget][self.association] = Union(P_Source, P_Target)

		#---
		if checkDup(self.node[P_nodeTarget][self.association]) == 1:
			print '---Asso dup'

		P_Source2 = self.node[P_nodeSource][self.mapping]
		P_Target2 = self.node[P_nodeTarget][self.mapping]
		self.node[P_nodeTarget][self.mapping] = Union(P_Source2, P_Target2)

		#---
		if len(self.node[P_nodeTarget][self.mapping])>self.NumberOfProtein_InList:
			print len(P_Source2), len(P_Target2)
		if checkDup(self.node[P_nodeTarget][self.mapping]) ==1:
			print '!!! mapping Dup'

		#print 'All protiens=',self.NumberOfAllProtein, 'All white protien#=',len(self.node[P_nodeTarget][self.association])
		#print 'whiteDraw=',len(self.node[P_nodeTarget][self.mapping]), 'Alldraw = ',self.NumberOfProtein_InList
		newPV = hyperGeo_Pvalue(len(self.node[P_nodeTarget][self.mapping]),self.NumberOfProtein_InList, len(self.node[P_nodeTarget][self.association_acc]),self.NumberOfAllProtein)
		self.node[P_nodeTarget][self.PV] = newPV
		self.TotalLost += L_lost
		Children_S = self.predecessors(P_nodeSource)
		Children_T = self.predecessors(P_nodeTarget)
		C_S_sub_T = Difference(Children_T,Children_S)
		#print 'Children of S not of T=',C_S_sub_T
		for proteinTp in C_S_sub_T:
			self.add_edge(proteinTp,P_nodeTarget)
			self.edge[proteinTp][P_nodeTarget]['weight'] = self.edge[proteinTp][P_nodeSource]['weight'] + self.edge[P_nodeSource][P_nodeTarget]['weight']
		self.remove_node(P_nodeSource)


	def findNodemWithMaxPvalue(self):
		'''
		search the goterm structure and retrun number of nodes that is not empty and the node with maximum P_value
		'''
		L_maxPV = -1
		L_NonEmpty = 0
		L_NodeMax = 'non'
		for nodeTp in self.nodes():
			if self.node[nodeTp][self.PV]>L_maxPV and nodeTp!='GO:0008150':
				L_maxPV = self.node[nodeTp][self.PV]
				L_NodeMax = nodeTp
			if len(self.node[nodeTp][self.mapping])>0:
				L_NonEmpty += 1
		return [L_NonEmpty,L_NodeMax,L_maxPV]

	def findNodemWithMaxPvalue_Leave(self):
		'''
		search the goterm structure and retrun number of nodes that is not empty and the node with maximum P_value
		'''
		L_maxPV = -1
		L_NonEmpty = 0
		L_NodeMax = 'non'
		for nodeTp in self.nodes():
			if self.node[nodeTp][self.PV]>L_maxPV and nodeTp!='GO:0008150' and len(self.predecessors(nodeTp)) == 0:
				L_maxPV = self.node[nodeTp][self.PV]
				L_NodeMax = nodeTp
			if len(self.node[nodeTp][self.mapping])>0:
				L_NonEmpty += 1
		return [L_NonEmpty,L_NodeMax,L_maxPV]

	def findNodemWithMaxPvalue2(self,P_minProteinNo):
		'''
		search the goterm structure and retrun number of nodes that is not empty and the node with maximum P_value
		'''
		L_maxPV = -1
		L_NonEmpty = 0
		L_NodeMax = 'non'
		for nodeTp in self.nodes():
			if self.node[nodeTp][self.PV]>L_maxPV and len(self.node[nodeTp][self.mapping])<P_minProteinNo  and nodeTp!='GO:0008150':
				L_maxPV = self.node[nodeTp][self.PV]
				L_NodeMax = nodeTp
			if len(self.node[nodeTp][self.mapping])>0:
				L_NonEmpty += 1
		return [L_NonEmpty,L_NodeMax,L_maxPV]

	def printResult(self):
		'''
		print 'The total information lost is:', self.TotalLost
		print 'The final goterm and associated protein list:'
		for gotp in self.nodes():
			if len(self.node[gotp][self.mapping])>0:
				print gotp,':',self.node[gotp][self.mapping]
		'''
		L_Goterms = {}
		for gotp in self.nodes():
			if len(self.node[gotp][self.mapping])>0:
				L_Goterms[gotp] = [self.node[gotp][self.mapping],self.node[gotp]['level']]
		return [self.TotalLost,L_Goterms]

	def printResult_Leave(self):
		'''
		print 'The total information lost is:', self.TotalLost
		print 'The final goterm and associated protein list:'
		for gotp in self.nodes():
			if len(self.node[gotp][self.mapping])>0:
				print gotp,':',self.node[gotp][self.mapping]
		'''
		L_Goterms = {}
		for gotp in self.nodes():
			if len(self.node[gotp][self.mapping])>0:
				if len(self.predecessors(gotp)) == 0:
					L_Goterms[gotp] = [self.node[gotp][self.mapping],self.node[gotp]['level']]
		return [self.TotalLost,L_Goterms]


	def gotermSummarization_1(self, P_geneList, P_value):
		'''
		use P_GotermNumber to summarizate a given list of proteins
		'''

		self.node_Mapping_ProteinList_PV2(P_geneList)
		self.symplify_Goterm_Structure()

		Status = self.findNodemWithMaxPvalue()
		while Status[0]>1 and Status[2]>P_value:
			try:
				L_parrents = self.successors(Status[1])
				L_MinWeight = 100000;L_MinPa = 'non'
				for nodetp in L_parrents:
					if self.edge[Status[1]][nodetp]['weight']<L_MinWeight:
						L_MinWeight = self.edge[Status[1]][nodetp]['weight']
						L_MinPa = nodetp
				if L_MinPa == 'non':
					print '---',Status[1],L_MinPa,L_parrents
				else:
					self. merge_nodes2(Status[1],L_MinPa)
			except:
				print 'Error---',Status
			Status = self. findNodemWithMaxPvalue()

		return self.printResult()


	def gotermSummarization(self, P_geneList, P_value, P_minProteinNo):
		'''
		use P_GotermNumber to summarizate a given list of proteins
		'''

		self.node_Mapping_ProteinList_PV2(P_geneList)
		self.symplify_Goterm_Structure()

		Status = self.findNodemWithMaxPvalue()
		while Status[1]!='non' and Status[2]>P_value:
			try:
				L_parrents = self.successors(Status[1])
				L_MinWeight = 100000;L_MinPa = 'non'
				for nodetp in L_parrents:
					if self.edge[Status[1]][nodetp]['weight']<L_MinWeight:
						L_MinWeight = self.edge[Status[1]][nodetp]['weight']
						L_MinPa = nodetp
				if L_MinPa == 'non':
					print '+++1',Status[1],L_MinPa,L_parrents
				else:
					self. merge_nodes2(Status[1],L_MinPa)
			except:
				print 'Error---1',Status

			Status = self.findNodemWithMaxPvalue()

		Status = self.findNodemWithMaxPvalue2(P_minProteinNo)
		countTemp = 0
		while Status[1]!='non':
			try:
				L_parrents = self.successors(Status[1])
				L_MinWeight = 100000;L_MinPa = 'non'
				for nodetp in L_parrents:
					if self.edge[Status[1]][nodetp]['weight']<L_MinWeight:
						L_MinWeight = self.edge[Status[1]][nodetp]['weight']
						L_MinPa = nodetp
				if L_MinPa == 'non':
					print '+++2',countTemp,Status[1],L_MinPa,L_parrents
					countTemp += 1
				else:
					self. merge_nodes2(Status[1],L_MinPa)
			except:
				print 'Error---2',countTemp,Status

			Status = self.findNodemWithMaxPvalue2(P_minProteinNo)


		return self.printResult()



def readExpressionData(p_fName):

	fileN = open(p_fName,'r')
	count = 0
	knock = ''
	ret = {}
	allGene = {}
	L_rets = {}
	L_allGene = []
	for line in fileN:
		lineTp = line.split('\t')
		knock = lineTp[1]; gene = lineTp[2]
		allGene[gene] = 1
		if ret.has_key(knock):
			ret[knock][gene]=1
		else:
			ret[knock] = {gene:1}

	for inst in ret:
		coGenes = []
		for gene in ret[inst]:
			coGenes.append(gene)
		L_rets[inst] = coGenes
	for gene in allGene:
		L_allGene.append(gene)
	return [L_rets,L_allGene]

def printResult(P_goTerm,P_result):
	output = 'Instance>'+P_goTerm
	output += '>'+str(P_result[0])+'\nBegin\n'
	for item in P_result[1]:
		output += item+'>'+str(P_result[1][item][1])+'>'
		for gene in P_result[1][item][0]:
			output += gene+':'
		output += '\n'
	output += 'End\n\n'
	return output
	

'''
------------------------------------------------------------------------------------------------------
#Example of using the class GoGraph:

#Data of Go Ontology structure and gene_Goterm association
weightGographData = 'newWeightedPubMedGO.xml'
geneGotermAssociationData = 'gene_association.sgd'

#Create a GoGraph object (Node: every time you use the gotermSummarization(), you need to create a new object)
G = GoGraph(weightGographData,geneGotermAssociationData)

#A list of genes are summarized
GeneList = ['ADH4', 'AFR1', 'AGA1', 'AGA2', 'AQR1', 'ARO10', 'ASG7', 'ASP3-3', 'AXL1', 'BAR1', 'BEM2', 'BNA1', 'CDC20', 'CDC24', 'CHS1', 'CHS7', 'CIK1', 'CLB1', 'CPR8', 'DDR48', 'ECM18', 'ENA1', 'ERG24', 'ERG6', 'FAA3', 'FAR1', 'FIG1', 'FIG2', 'FIT2', 'FRE2', 'FUS1', 'FUS2', 'FUS3', 'GAS2', 'GAT1', 'GFA1', 'GIC2', 'GIT1', 'GPA1', 'GSC2', 'GYP8', 'HOF1', 'HOR2', 'HO', 'HSP31', 'HST3', 'HYM1', 'ICL1', 'ICS2', 'IDH1', 'KAR4', 'KAR5', 'KRE6', 'KSS1', 'KTR2', 'LEU2', 'LSB3', 'LYS2', 'MDJ2', 'MET1', 'MID2', 'MNN1', 'MNT2', 'MRH1', 'MSB2', 'MSG5', 'NDE1', 'NDJ1', 'NRM1', 'PCL2', 'PCL7', 'PCL9', 'PET9', 'PGU1', 'PHD1', 'PHO89', 'PRM10', 'PRM1', 'PRM2', 'PRM3', 'PRM4', 'PRM5', 'PRM6', 'PRM8', 'PRM9', 'PRP39', 'PRY2', 'PST1', 'RAX2', 'RDI1', 'RHR2', 'SAM35', 'SCW10', 'SIL1', 'SLI15', 'SRL1', 'SRL3', 'SST2', 'STE12', 'STE2', 'STE4', 'SUP45', 'SVS1', 'TAT1', 'TGS1', 'TIP1', 'TYE7', 'UTR2', 'WSC2', 'WSC3', 'YAR009C', 'YAR068W', 'YBR012W-B', 'YBR071W', 'YCL056C', 'YDR124W', 'YDR249C', 'YDR366C', 'YER138C', 'YER158C', 'YER160C', 'YGR122W', 'YHB1', 'YIL080W', 'YIL082W-A', 'YIL083C', 'YIL169C', 'YJR027W', 'YJR029W', 'YLL054C', 'YLR042C', 'YLR108C', 'YLR414C', 'YML039W', 'YML119W', 'YMR045C', 'YMR050C', 'YNL208W', 'YRO2', 'ZIP1']


#Using Go term to summarize the list of gene.

Result =  G.gotermSummarization(GeneList,0.05,5)
#0.05 is the threshold of P_Value of the Go term node in final result.
#5 is the minimum number of genes in the Go term node in final result.

#The result has the format: [value_0,{Goterm_1:[a list of genes_1,value_1],Goterm_2:[a list of genes_2,value_2],....}]
# value_0: the total information lost for the summarization.
# Goterm_1: Goterm ID that is used to summarize the given list of gene.
# a list of genes_1: a subset of genes (in the given list of gene) that are annotated by Goterm_1.
# value_1: the level of Goterm_1 on the Go Ontology. The root note is in level 1.

------------------------------------------------------------------------------
'''