#!/usr/bin/python import os, csv, sys, math import xml.dom.minidom import networkx #In this program, associated proteins will include proteins that associated theit children def checkDup(P_list): PTp = P_list PTp.sort() for i in (range(len(PTp)-1)): if PTp[i]==PTp[i+1]: return 1 return 0 def hyperGeo_pr(p_x,p_k,p_m,p_n): """Function to calculate the probability input: p_x: number of white balls drawn. p_k: number of balls drawn (note p_x is not larger than p_k). p_m: number of white balls in the urn. p_n: number of black balls in the urn. output: the probability that has p_x white balls in a drawing of p_k balls. """ l_nue = [] l_deno = [] for j in range(p_x): l_nue.append(p_m-j) l_deno.append(j+1) for j in range(p_k-p_x): l_nue.append(p_n-j) l_deno.append(j+1) for j in range(p_k): l_deno.append(p_m+p_n-j) l_nue.append(j+1) l_deno.sort() l_nue.sort() #l_p = 1.0 l_p2 = 0.0 try: for j in range(len(l_deno)): #l_p = l_p*l_nue[j]/l_deno[j] l_p2 += math.log(l_nue[j]) - math.log(l_deno[j]) except: return 0 #return math.exp(l_p) return math.exp(l_p2) def hyperGeo_Pvalue(white_ball_drawn,total_ball_drawn,white_ball_in_urn,total_ball_in_urn): """Function to calculate the p_value input: white_ball_drawn: number of white balls drawn. white_ball_in_urn: number of white balls in the urn. total_ball_in_urn: number of black balls in the urn. total_ball_drawn: number of balls drawn (note white_ball_drawn is not larger than total_ball_drawn). output: 1-the probability that has at least white_ball_drawn white balls in a drawing of total_ball_drawn balls. """ l_p = 0.0 for j in range(white_ball_drawn): hptp = hyperGeo_pr(j,total_ball_drawn,white_ball_in_urn,total_ball_in_urn-white_ball_in_urn) l_p += hptp if l_p>1: l_p = 1.0 return (1-l_p) def Union(P_list1, P_list2): L_rets = [] L_dic = {} for item in P_list1: L_rets.append(item) L_dic[item] = 1 for item in P_list2: if not(L_dic.has_key(item)): L_rets.append(item) L_rets.sort() return L_rets def Intersection(P_list1, P_list2): L_rets = [] L_dic = {} for item in P_list1: L_dic[item] = 1 for item in P_list2: if L_dic.has_key(item): L_rets.append(item) L_rets.sort() return L_rets def Difference(P_list1, P_list2): #return P_list2 - P_list1 L_rets = [] L_dic = {} for item in P_list1: L_dic[item] = 1 for item in P_list2: if not(L_dic.has_key(item)): L_rets.append(item) L_rets.sort() return L_rets class GoGraph(networkx.DiGraph): def __init__(self,P_weightGraph, P_Associate): networkx.DiGraph.__init__(self) self.association = 'Association_proteins' self.association_acc = 'Association_Accumulation' self.mapping = 'mapping_protiens' self.PV = 'P_value' self.TotalLost = 0.0 self.gene2Goterm = self.Gene2GoTermAssociation(P_Associate) self.Goterm2gene = self.GoTerm2GeneAssociation(P_Associate) self.NumberOfAllProtein = len(self.gene2Goterm) self.createDiGraph(P_weightGraph) def readWeights(self, P_filename = ''): ''' read edge-weighted go structure data ''' xmldoc = xml.dom.minidom.parse(P_filename) xmlData = xmldoc.firstChild goTermList = xmlData.getElementsByTagName('term') goGraphRe = {} count =0 for term in goTermList: count += 0 if count<10: go = term.attributes['id'] goGraphRe[go.value] = {} parentList = term.getElementsByTagName('parent') for par in parentList: parGO = par.attributes['id'] goGraphRe[go.value][parGO.value] = {} edgeList = par.getElementsByTagName('edge') for edge in edgeList: edgeType = edge.attributes['type'] goGraphRe[go.value][parGO.value] = float(edge.firstChild.data) #print go.value, parGO.value, float(edge.firstChild.data) return goGraphRe def createDiGraph(self, P_filename): ''' create edge-weighted Goterm structure ''' GoStructure = self.readWeights(P_filename) for chtp in GoStructure: for pa in GoStructure[chtp]: self.add_edge(chtp,pa) self.edge[chtp][pa]['weight'] = GoStructure[chtp][pa] self.node_AssociationProtienData() self.node_Depth() def Gene2GoTermAssociation(self, P_fName): ''' create gene-goterm association relation ''' L_rets = {} L_retNoDup = {} L_fileLipid = open(P_fName) for line in L_fileLipid: line2 = line.split('\t') if len(line2)>10: if L_rets.has_key(line2[2]): L_rets[line2[2]][line2[4]] = 1 else: L_rets[line2[2]] = {line2[4]:1} for gene in L_rets: GList = [] for Gtemp in L_rets[gene]: GList.append(Gtemp) L_retNoDup[gene] = GList return L_retNoDup def GoTerm2GeneAssociation(self, P_fName): ''' create goterm-gene association relation ''' L_rets = {} L_retNoDup = {} L_fileLipid = open(P_fName) for line in L_fileLipid: line2 = line.split('\t') if len(line2)>10: if L_rets.has_key(line2[4]): L_rets[line2[4]][line2[2]] = 1 else: L_rets[line2[4]] = {line2[2]:1} for Gtp in L_rets: GeneList = [] for gene in L_rets[Gtp]: GeneList.append(gene) L_retNoDup[Gtp] = GeneList return L_retNoDup def propagate_AssociatedProteins(self): Top_sort_nodes = networkx.topological_sort(self) for nodeTp in Top_sort_nodes: ParentTp = self.successors(nodeTp) AssoSelf = self.node[nodeTp][self.association_acc] for nodePare in ParentTp: AssoPare = self.node[nodePare][self.association_acc] #print nodeTp,nodePare,AssoSelf,AssoPare UnionTwo = Union(AssoSelf,AssoPare) self.node[nodePare][self.association_acc] = UnionTwo def node_AssociationProtienData(self): for gotermTp in self.nodes(): self.node[gotermTp]['level'] = 1000000000 if self.Goterm2gene.has_key(gotermTp): self.node[gotermTp][self.association] = self.Goterm2gene[gotermTp] self.node[gotermTp][self.association_acc] = self.Goterm2gene[gotermTp] else: self.node[gotermTp][self.association] = [] self.node[gotermTp][self.association_acc] = [] self.node[gotermTp][self.mapping] = [] self.propagate_AssociatedProteins() def node_Depth(self): Top_sort_nodes = networkx.topological_sort(self) OrderReverse=[] for i in range(len(Top_sort_nodes)): OrderReverse.append(Top_sort_nodes[len(Top_sort_nodes)-i-1]) self.node[OrderReverse[0]]['level'] = 1 for nodeTp in OrderReverse: ChildrenTp = self.predecessors(nodeTp) for chd in ChildrenTp: if self.node[chd]['level']>self.node[nodeTp]['level']+1: self.node[chd]['level']=self.node[nodeTp]['level']+1 def symplify_Goterm_Structure(self): Top_Order = networkx.topological_sort(self) for nodeTp in Top_Order: if len(self.node[nodeTp][self.mapping])==0 and len(self.predecessors(nodeTp)) == 0: self.remove_node(nodeTp) def node_Mapping_ProteinList_PV1(self, P_protein_InList): Gene_List_Associated = [] for gene in P_protein_InList: if self.gene2Goterm.has_key(gene): Gene_List_Associated.append(gene) self.NumberOfProtein_InList = len(Gene_List_Associated) for nodeID in self.nodes(): P_Association=self.node[nodeID][self.association] P_protein = Intersection(P_protein_InList,P_Association) self.node[nodeID][self.mapping] = P_protein newPV = hyperGeo_Pvalue(len(P_protein),self.NumberOfProtein_InList, len(P_Association),self.NumberOfAllProtein) self.node[nodeID][self.PV] = newPV #--- if checkDup(P_Association)==1: print 'Asso dup' if checkDup(P_protein_InList)==1: print 'PList dup' if checkDup(P_protein)==1: print 'Intersection dup' def node_Mapping_ProteinList_PV2(self, P_protein_InList): Gene_List_Associated = [] for gene in P_protein_InList: if self.gene2Goterm.has_key(gene): Gene_List_Associated.append(gene) self.NumberOfProtein_InList = len(Gene_List_Associated) for nodeID in self.nodes(): P_Association=self.node[nodeID][self.association] P_Association2=self.node[nodeID][self.association_acc] P_protein = Intersection(P_protein_InList,P_Association) self.node[nodeID][self.mapping] = P_protein newPV = hyperGeo_Pvalue(len(P_protein),self.NumberOfProtein_InList, len(P_Association2),self.NumberOfAllProtein) self.node[nodeID][self.PV] = newPV #--- if checkDup(P_Association)==1: print 'Asso dup' if checkDup(P_protein_InList)==1: print 'PList dup' if checkDup(P_protein)==1: print 'Intersection dup' def calculate_lost(self, P_nodeSource, P_nodeTarget): edgeWeight = self.edge[P_nodeSource][P_nodeTarget]['weight'] L_lost = edgeWeight*len(self.node[P_nodeSource][self.mapping]) #print 'edgeWeight=',edgeWeight, 'sizeOfSource=',len(self.node[P_nodeSource][self.mapping]),'lost=',L_lost return L_lost def merge_nodes(self, P_nodeSource, P_nodeTarget): ''' merge association protien in source node into target node, update the P_value of the target node, and delete source node from the graph ''' L_lost = self. calculate_lost(P_nodeSource, P_nodeTarget) P_Source = self.node[P_nodeSource][self.association] P_Target = self.node[P_nodeTarget][self.association] self.node[P_nodeTarget][self.association] = Union(P_Source, P_Target) #--- if checkDup(self.node[P_nodeTarget][self.association]) == 1: print '---Asso dup' P_Source2 = self.node[P_nodeSource][self.mapping] P_Target2 = self.node[P_nodeTarget][self.mapping] self.node[P_nodeTarget][self.mapping] = Union(P_Source2, P_Target2) #--- if len(self.node[P_nodeTarget][self.mapping])>self.NumberOfProtein_InList: print len(P_Source2), len(P_Target2) if checkDup(self.node[P_nodeTarget][self.mapping]) ==1: print '!!! mapping Dup' #print 'All protiens=',self.NumberOfAllProtein, 'All white protien#=',len(self.node[P_nodeTarget][self.association]) #print 'whiteDraw=',len(self.node[P_nodeTarget][self.mapping]), 'Alldraw = ',self.NumberOfProtein_InList newPV = hyperGeo_Pvalue(len(self.node[P_nodeTarget][self.mapping]),self.NumberOfProtein_InList, len(self.node[P_nodeTarget][self.association]),self.NumberOfAllProtein) self.node[P_nodeTarget][self.PV] = newPV self.TotalLost += L_lost Children_S = self.predecessors(P_nodeSource) Children_T = self.predecessors(P_nodeTarget) C_S_sub_T = Difference(Children_T,Children_S) #print 'Children of S not of T=',C_S_sub_T for proteinTp in C_S_sub_T: self.add_edge(proteinTp,P_nodeTarget) self.edge[proteinTp][P_nodeTarget]['weight'] = self.edge[proteinTp][P_nodeSource]['weight'] + self.edge[P_nodeSource][P_nodeTarget]['weight'] self.remove_node(P_nodeSource) def merge_nodes2(self, P_nodeSource, P_nodeTarget): ''' merge association protien in source node into target node, update the P_value of the target node, and delete source node from the graph ''' L_lost = self. calculate_lost(P_nodeSource, P_nodeTarget) P_Source = self.node[P_nodeSource][self.association] P_Target = self.node[P_nodeTarget][self.association] self.node[P_nodeTarget][self.association] = Union(P_Source, P_Target) #--- if checkDup(self.node[P_nodeTarget][self.association]) == 1: print '---Asso dup' P_Source2 = self.node[P_nodeSource][self.mapping] P_Target2 = self.node[P_nodeTarget][self.mapping] self.node[P_nodeTarget][self.mapping] = Union(P_Source2, P_Target2) #--- if len(self.node[P_nodeTarget][self.mapping])>self.NumberOfProtein_InList: print len(P_Source2), len(P_Target2) if checkDup(self.node[P_nodeTarget][self.mapping]) ==1: print '!!! mapping Dup' #print 'All protiens=',self.NumberOfAllProtein, 'All white protien#=',len(self.node[P_nodeTarget][self.association]) #print 'whiteDraw=',len(self.node[P_nodeTarget][self.mapping]), 'Alldraw = ',self.NumberOfProtein_InList newPV = hyperGeo_Pvalue(len(self.node[P_nodeTarget][self.mapping]),self.NumberOfProtein_InList, len(self.node[P_nodeTarget][self.association_acc]),self.NumberOfAllProtein) self.node[P_nodeTarget][self.PV] = newPV self.TotalLost += L_lost Children_S = self.predecessors(P_nodeSource) Children_T = self.predecessors(P_nodeTarget) C_S_sub_T = Difference(Children_T,Children_S) #print 'Children of S not of T=',C_S_sub_T for proteinTp in C_S_sub_T: self.add_edge(proteinTp,P_nodeTarget) self.edge[proteinTp][P_nodeTarget]['weight'] = self.edge[proteinTp][P_nodeSource]['weight'] + self.edge[P_nodeSource][P_nodeTarget]['weight'] self.remove_node(P_nodeSource) def findNodemWithMaxPvalue(self): ''' search the goterm structure and retrun number of nodes that is not empty and the node with maximum P_value ''' L_maxPV = -1 L_NonEmpty = 0 L_NodeMax = 'non' for nodeTp in self.nodes(): if self.node[nodeTp][self.PV]>L_maxPV and nodeTp!='GO:0008150': L_maxPV = self.node[nodeTp][self.PV] L_NodeMax = nodeTp if len(self.node[nodeTp][self.mapping])>0: L_NonEmpty += 1 return [L_NonEmpty,L_NodeMax,L_maxPV] def findNodemWithMaxPvalue_Leave(self): ''' search the goterm structure and retrun number of nodes that is not empty and the node with maximum P_value ''' L_maxPV = -1 L_NonEmpty = 0 L_NodeMax = 'non' for nodeTp in self.nodes(): if self.node[nodeTp][self.PV]>L_maxPV and nodeTp!='GO:0008150' and len(self.predecessors(nodeTp)) == 0: L_maxPV = self.node[nodeTp][self.PV] L_NodeMax = nodeTp if len(self.node[nodeTp][self.mapping])>0: L_NonEmpty += 1 return [L_NonEmpty,L_NodeMax,L_maxPV] def findNodemWithMaxPvalue2(self,P_minProteinNo): ''' search the goterm structure and retrun number of nodes that is not empty and the node with maximum P_value ''' L_maxPV = -1 L_NonEmpty = 0 L_NodeMax = 'non' for nodeTp in self.nodes(): if self.node[nodeTp][self.PV]>L_maxPV and len(self.node[nodeTp][self.mapping])0: L_NonEmpty += 1 return [L_NonEmpty,L_NodeMax,L_maxPV] def printResult(self): ''' print 'The total information lost is:', self.TotalLost print 'The final goterm and associated protein list:' for gotp in self.nodes(): if len(self.node[gotp][self.mapping])>0: print gotp,':',self.node[gotp][self.mapping] ''' L_Goterms = {} for gotp in self.nodes(): if len(self.node[gotp][self.mapping])>0: L_Goterms[gotp] = [self.node[gotp][self.mapping],self.node[gotp]['level']] return [self.TotalLost,L_Goterms] def printResult_Leave(self): ''' print 'The total information lost is:', self.TotalLost print 'The final goterm and associated protein list:' for gotp in self.nodes(): if len(self.node[gotp][self.mapping])>0: print gotp,':',self.node[gotp][self.mapping] ''' L_Goterms = {} for gotp in self.nodes(): if len(self.node[gotp][self.mapping])>0: if len(self.predecessors(gotp)) == 0: L_Goterms[gotp] = [self.node[gotp][self.mapping],self.node[gotp]['level']] return [self.TotalLost,L_Goterms] def gotermSummarization_1(self, P_geneList, P_value): ''' use P_GotermNumber to summarizate a given list of proteins ''' self.node_Mapping_ProteinList_PV2(P_geneList) self.symplify_Goterm_Structure() Status = self.findNodemWithMaxPvalue() while Status[0]>1 and Status[2]>P_value: try: L_parrents = self.successors(Status[1]) L_MinWeight = 100000;L_MinPa = 'non' for nodetp in L_parrents: if self.edge[Status[1]][nodetp]['weight']P_value: try: L_parrents = self.successors(Status[1]) L_MinWeight = 100000;L_MinPa = 'non' for nodetp in L_parrents: if self.edge[Status[1]][nodetp]['weight']'+str(P_result[1][item][1])+'>' for gene in P_result[1][item][0]: output += gene+':' output += '\n' output += 'End\n\n' return output ''' ------------------------------------------------------------------------------------------------------ #Example of using the class GoGraph: #Data of Go Ontology structure and gene_Goterm association weightGographData = 'newWeightedPubMedGO.xml' geneGotermAssociationData = 'gene_association.sgd' #Create a GoGraph object (Node: every time you use the gotermSummarization(), you need to create a new object) G = GoGraph(weightGographData,geneGotermAssociationData) #A list of genes are summarized GeneList = ['ADH4', 'AFR1', 'AGA1', 'AGA2', 'AQR1', 'ARO10', 'ASG7', 'ASP3-3', 'AXL1', 'BAR1', 'BEM2', 'BNA1', 'CDC20', 'CDC24', 'CHS1', 'CHS7', 'CIK1', 'CLB1', 'CPR8', 'DDR48', 'ECM18', 'ENA1', 'ERG24', 'ERG6', 'FAA3', 'FAR1', 'FIG1', 'FIG2', 'FIT2', 'FRE2', 'FUS1', 'FUS2', 'FUS3', 'GAS2', 'GAT1', 'GFA1', 'GIC2', 'GIT1', 'GPA1', 'GSC2', 'GYP8', 'HOF1', 'HOR2', 'HO', 'HSP31', 'HST3', 'HYM1', 'ICL1', 'ICS2', 'IDH1', 'KAR4', 'KAR5', 'KRE6', 'KSS1', 'KTR2', 'LEU2', 'LSB3', 'LYS2', 'MDJ2', 'MET1', 'MID2', 'MNN1', 'MNT2', 'MRH1', 'MSB2', 'MSG5', 'NDE1', 'NDJ1', 'NRM1', 'PCL2', 'PCL7', 'PCL9', 'PET9', 'PGU1', 'PHD1', 'PHO89', 'PRM10', 'PRM1', 'PRM2', 'PRM3', 'PRM4', 'PRM5', 'PRM6', 'PRM8', 'PRM9', 'PRP39', 'PRY2', 'PST1', 'RAX2', 'RDI1', 'RHR2', 'SAM35', 'SCW10', 'SIL1', 'SLI15', 'SRL1', 'SRL3', 'SST2', 'STE12', 'STE2', 'STE4', 'SUP45', 'SVS1', 'TAT1', 'TGS1', 'TIP1', 'TYE7', 'UTR2', 'WSC2', 'WSC3', 'YAR009C', 'YAR068W', 'YBR012W-B', 'YBR071W', 'YCL056C', 'YDR124W', 'YDR249C', 'YDR366C', 'YER138C', 'YER158C', 'YER160C', 'YGR122W', 'YHB1', 'YIL080W', 'YIL082W-A', 'YIL083C', 'YIL169C', 'YJR027W', 'YJR029W', 'YLL054C', 'YLR042C', 'YLR108C', 'YLR414C', 'YML039W', 'YML119W', 'YMR045C', 'YMR050C', 'YNL208W', 'YRO2', 'ZIP1'] #Using Go term to summarize the list of gene. Result = G.gotermSummarization(GeneList,0.05,5) #0.05 is the threshold of P_Value of the Go term node in final result. #5 is the minimum number of genes in the Go term node in final result. #The result has the format: [value_0,{Goterm_1:[a list of genes_1,value_1],Goterm_2:[a list of genes_2,value_2],....}] # value_0: the total information lost for the summarization. # Goterm_1: Goterm ID that is used to summarize the given list of gene. # a list of genes_1: a subset of genes (in the given list of gene) that are annotated by Goterm_1. # value_1: the level of Goterm_1 on the Go Ontology. The root note is in level 1. ------------------------------------------------------------------------------ '''