#!/usr/bin/python import os, csv, sys, math, time import xml.dom.minidom import networkx #In this program, associated proteins will include proteins that associated theit children def checkDup(P_list): PTp = P_list PTp.sort() for i in (range(len(PTp)-1)): if PTp[i]==PTp[i+1]: return 1 return 0 def hyperGeo_pr(p_x,p_k,p_m,p_n): """Function to calculate the probability input: p_x: number of white balls drawn. p_k: number of balls drawn (note p_x is not larger than p_k). p_m: number of white balls in the urn. p_n: number of black balls in the urn. output: the probability that has p_x white balls in a drawing of p_k balls. """ l_nue = [] l_deno = [] for j in range(p_x): l_nue.append(p_m-j) l_deno.append(j+1) for j in range(p_k-p_x): l_nue.append(p_n-j) l_deno.append(j+1) for j in range(p_k): l_deno.append(p_m+p_n-j) l_nue.append(j+1) l_deno.sort() l_nue.sort() #l_p = 1.0 l_p2 = 0.0 try: for j in range(len(l_deno)): #l_p = l_p*l_nue[j]/l_deno[j] l_p2 += math.log(l_nue[j]) - math.log(l_deno[j]) except: return 0 #return math.exp(l_p) return math.exp(l_p2) def hyperGeo_Pvalue(white_ball_drawn,total_ball_drawn,white_ball_in_urn,total_ball_in_urn): """Function to calculate the p_value input: white_ball_drawn: number of white balls drawn. white_ball_in_urn: number of white balls in the urn. total_ball_in_urn: number of black balls in the urn. total_ball_drawn: number of balls drawn (note white_ball_drawn is not larger than total_ball_drawn). output: 1-the probability that has at least white_ball_drawn white balls in a drawing of total_ball_drawn balls. """ l_p = 0.0 for j in range(white_ball_drawn): hptp = hyperGeo_pr(j,total_ball_drawn,white_ball_in_urn,total_ball_in_urn-white_ball_in_urn) l_p += hptp if l_p>1: l_p = 1.0 return (1-l_p) def Union(P_list1, P_list2): L_rets = [] L_dic = {} for item in P_list1: L_rets.append(item) L_dic[item] = 1 for item in P_list2: if not(L_dic.has_key(item)): L_rets.append(item) L_rets.sort() return L_rets def Intersection(P_list1, P_list2): L_rets = [] L_dic = {} for item in P_list1: L_dic[item] = 1 for item in P_list2: if L_dic.has_key(item): L_rets.append(item) L_rets.sort() return L_rets def Difference(P_list1, P_list2): #return P_list2 - P_list1 L_rets = [] L_dic = {} for item in P_list1: L_dic[item] = 1 for item in P_list2: if not(L_dic.has_key(item)): L_rets.append(item) L_rets.sort() return L_rets class GotermEnrich(networkx.DiGraph): def __init__(self,P_weightGraph, P_Associate): networkx.DiGraph.__init__(self) self.association = 'Association_proteins' self.association_acc = 'Association_Accumulation' self.mapping = 'mapping_protiens' self.PV = 'P_value' self.TotalLost = 0.0 self.gene2Goterm = self.Gene2GoTermAssociation(P_Associate) self.Goterm2gene = self.GoTerm2GeneAssociation(P_Associate) self.NumberOfAllProtein = len(self.gene2Goterm) self.createDiGraph(P_weightGraph) def readWeights(self, P_filename = ''): ''' read edge-weighted go structure data ''' xmldoc = xml.dom.minidom.parse(P_filename) xmlData = xmldoc.firstChild goTermList = xmlData.getElementsByTagName('term') goGraphRe = {} count =0 for term in goTermList: count += 0 if count<10: go = term.attributes['id'] goGraphRe[go.value] = {} parentList = term.getElementsByTagName('parent') for par in parentList: parGO = par.attributes['id'] goGraphRe[go.value][parGO.value] = {} edgeList = par.getElementsByTagName('edge') for edge in edgeList: edgeType = edge.attributes['type'] goGraphRe[go.value][parGO.value] = float(edge.firstChild.data) #print go.value, parGO.value, float(edge.firstChild.data) return goGraphRe def createDiGraph(self, P_filename): ''' create edge-weighted Goterm structure ''' GoStructure = self.readWeights(P_filename) for chtp in GoStructure: for pa in GoStructure[chtp]: self.add_edge(chtp,pa) self.edge[chtp][pa]['weight'] = GoStructure[chtp][pa] self.node_AssociationProtienData() self.node_Depth() def Gene2GoTermAssociation(self, P_fName): ''' create gene-goterm association relation ''' L_rets = {} L_retNoDup = {} L_fileLipid = open(P_fName) for line in L_fileLipid: line2 = line.split('\t') if len(line2)>10: if L_rets.has_key(line2[2]): L_rets[line2[2]][line2[4]] = 1 else: L_rets[line2[2]] = {line2[4]:1} for gene in L_rets: GList = [] for Gtemp in L_rets[gene]: GList.append(Gtemp) L_retNoDup[gene] = GList return L_retNoDup def GoTerm2GeneAssociation(self, P_fName): ''' create goterm-gene association relation ''' L_rets = {} L_retNoDup = {} L_fileLipid = open(P_fName) for line in L_fileLipid: line2 = line.split('\t') if len(line2)>10: if L_rets.has_key(line2[4]): L_rets[line2[4]][line2[2]] = 1 else: L_rets[line2[4]] = {line2[2]:1} for Gtp in L_rets: GeneList = [] for gene in L_rets[Gtp]: GeneList.append(gene) L_retNoDup[Gtp] = GeneList return L_retNoDup def propagate_AssociatedProteins(self): Top_sort_nodes = networkx.topological_sort(self) for nodeTp in Top_sort_nodes: ParentTp = self.successors(nodeTp) AssoSelf = self.node[nodeTp][self.association_acc] for nodePare in ParentTp: AssoPare = self.node[nodePare][self.association_acc] #print nodeTp,nodePare,AssoSelf,AssoPare UnionTwo = Union(AssoSelf,AssoPare) self.node[nodePare][self.association_acc] = UnionTwo def node_AssociationProtienData(self): for gotermTp in self.nodes(): self.node[gotermTp]['level'] = 1000000000 if self.Goterm2gene.has_key(gotermTp): self.node[gotermTp][self.association] = self.Goterm2gene[gotermTp] self.node[gotermTp][self.association_acc] = self.Goterm2gene[gotermTp] else: self.node[gotermTp][self.association] = [] self.node[gotermTp][self.association_acc] = [] self.node[gotermTp][self.mapping] = [] self.propagate_AssociatedProteins() def node_Depth(self): Top_sort_nodes = networkx.topological_sort(self) OrderReverse=[] for i in range(len(Top_sort_nodes)): OrderReverse.append(Top_sort_nodes[len(Top_sort_nodes)-i-1]) self.node[OrderReverse[0]]['level'] = 1 for nodeTp in OrderReverse: ChildrenTp = self.predecessors(nodeTp) for chd in ChildrenTp: if self.node[chd]['level']>self.node[nodeTp]['level']+1: self.node[chd]['level']=self.node[nodeTp]['level']+1 def symplify_Goterm_StructureNoDeletion(self,P_pv,P_size): Top_Order = networkx.topological_sort(self) for nodeTp in Top_Order: Children = self.predecessors(nodeTp) self.node[nodeTp]['leaf'] = 2 tempSt = 1 for child in Children: if self.node[child]['leaf'] > 0: tempSt = 0;break if tempSt == 1: if len(self.node[nodeTp][self.mapping])>=P_size and self.node[nodeTp][self.PV]<=P_pv: self.node[nodeTp]['leaf'] = 1 else: self.node[nodeTp]['leaf'] = 0 def printResult(self, P_pvThreshold): L_Goterms = {} for gotp in self.nodes(): if len(self.node[gotp][self.mapping])>0 and self.node[gotp][self.PV]<=P_pvThreshold: L_Goterms[gotp] = [self.node[gotp][self.mapping],self.node[gotp]['level'],self.node[gotp][self.PV]] return L_Goterms def Goterm_enrich(self, P_protein_InList, P_pvThreshold, P_topRank,P_size): Gene_List_Associated = [] PV_Score = {} L_retTemp =[] for gene in P_protein_InList: if self.gene2Goterm.has_key(gene): Gene_List_Associated.append(gene) self.NumberOfProtein_InList = len(Gene_List_Associated) for nodeID in self.nodes(): P_Association=self.node[nodeID][self.association] P_Association2=self.node[nodeID][self.association_acc] P_protein = Intersection(P_protein_InList,P_Association2) self.node[nodeID][self.mapping] = P_protein newPV = hyperGeo_Pvalue(len(P_protein),self.NumberOfProtein_InList, len(P_Association2),self.NumberOfAllProtein) self.node[nodeID][self.PV] = newPV ''' if newPV<=P_pvThreshold: L_retTemp.append([nodeID,P_protein,newPV]) PV_Score[newPV]=1 ''' #--- if checkDup(P_Association)==1: print 'Asso dup' if checkDup(P_protein_InList)==1: print 'PList dup' if checkDup(P_protein)==1: print 'Intersection dup' self.symplify_Goterm_StructureNoDeletion(P_pvThreshold,P_size) for nodeID in self.nodes(): if self.node[nodeID]['leaf'] == 1: L_retTemp.append([nodeID,self.node[nodeID][self.mapping],self.node[nodeID][self.PV],self.node[nodeID]['level']]) PV_Score[self.node[nodeID][self.PV]]=1 PV_ScoreSort = [] for item in PV_Score: PV_ScoreSort.append(item) PV_ScoreSort.sort() idxTp = P_topRank-1 if idxTp>=len(PV_ScoreSort)-1: idxTp = len(PV_ScoreSort)-1 ScoreUp = -1 if idxTp>0: ScoreUp = PV_ScoreSort[idxTp] L_ret = [] for item in L_retTemp: if item[2]<=ScoreUp: L_ret.append(item) #print item return L_ret ''' #Example of using class GotermEnrich #Generate an object of the GotermEnrich class weightGographData = './data/newWeightedPubMedGO.xml' geneGotermAssociationData = './data/gene_association.sgd' G = GotermEnrich(weightGographData,geneGotermAssociationData) #two list needed to be processed list1= ['FUS3', 'RAD6', 'STE24', 'DIG2', 'DIG1', 'STE11', 'STE12', 'STE7', 'STE4', 'STE5', 'STE18', 'KSS1'] list2= ['YER044C', 'ERG3', 'CMD1', 'ERG11', 'YMR031W-A', 'SWI4', 'TUNICAMYCIN'] #call the Go term enriched function (Parameters: list1--a list of genes; 0.05--the PValue threshold; # 2--number of top score (PValue) Go terms that are outputed. 5--Minimum number of genes in the Go terms of solutions) #The return value of Goterm_enrich() is a list, where each element is also a list # [goterm ID, a list of genes annotated with this goterm, PValue, the level of go term in the Ontology structure]. result1 = G.Goterm_enrich(list1,0.05,2,5) result2 = G.Goterm_enrich(list2,0.05,2,3) '''