#!/usr/bin/python import os, csv, sys, time, re class DenseSubGraph(): def __init__(self): ''' Initialize the class object geneList -- a dictionary that each key is an instance ID, such as a deleted gene or a patient's TCGA id, each eletmen is a list of co-expressed genes ''' self.__base = ['a','b','c','d','e'] def __k_subsets_i(self, n, k): ''' Yield each subset of size k from the set of intergers 0 .. n - 1 n -- an integer > 0 k -- an integer > 0 ''' # Validate args if n < 0: raise ValueError('n must be > 0, got n=%d' % n) if k < 0: raise ValueError('k must be > 0, got k=%d' % k) # check base cases if k == 0 or n < k: yield set() elif n == k: yield set(range(n)) else: # Use recursive formula based on binomial coeffecients: # choose(n, k) = choose(n - 1, k - 1) + choose(n - 1, k) for s in self.__k_subsets_i(n - 1, k - 1): s.add(n - 1) yield s for s in self.__k_subsets_i(n - 1, k): yield s def __k_subsets(self, s, k): ''' Yield all subsets of size k from set (or list) s s -- a set or list (any iterable will suffice) k -- an integer > 0 ''' s = list(s) n = len(s) for k_set in self.__k_subsets_i(n, k): yield set([s[i] for i in k_set]) def __makeMatrix(self, P_mRow): ''' Create a bipartite graph from self.__mInstances using a matrix P_mRow -- the minimum instance number that a gene is repeatedly used ''' L_matrix = [] L_allInstances = [] L_allGenes = [] geneTotal = {} for inst in self.__mInstances: for geneTp in self.__mInstances[inst]: if geneTotal.has_key(geneTp): geneTotal[geneTp] += 1 else: geneTotal[geneTp] = 1 for item in geneTotal: if geneTotal[item]>=P_mRow: L_allGenes.append(item) L_allGenes.sort() for inst in self.__mInstances: L_allInstances.append(inst) geneSub = {} for geneTp in self.__mInstances[inst]: geneSub[geneTp] = 1 matRow = [] for geneTp in L_allGenes: if geneSub.has_key(geneTp): matRow.append(1) else: matRow.append(0) L_matrix.append(matRow) return [L_allInstances,L_allGenes,L_matrix] def __getSubSolutionScore(self, P_rNo, P_record, P_ratio): ''' Calculate the score of a subsolution P_rNo -- the number of instances P_record -- a list to remember the number that a gene has been used ''' L_Score = 0 for item in P_record: if item >= P_rNo*P_ratio: L_Score += (P_rNo - 1/(1-P_ratio)*(P_rNo-item) + 0.001) return L_Score def Find_Subgraph(self, geneList): L_TFAll = {} L_TFlist = [] L_comGenes = [] L_instances = [] L_Score = 0 self.__mInstances = geneList P_mRow = 3 MatTp = self.__makeMatrix(P_mRow) instances = range(len(MatTp[0])) ''' print '------------------Matrix' for row in MatTp[2]: print row print instances ''' if len(instances)>=4: subsets = self.__k_subsets(instances,4) for sset in subsets: commonGenesTp = [0]*len(MatTp[1]) subSolution = list(sset) selectPool = range(max(subSolution)+1,len(MatTp[0])) #print subSolution,'\n',selectPool for idx in subSolution: for idx2 in range(len(commonGenesTp)): commonGenesTp[idx2] += MatTp[2][idx][idx2] #print commonGenesTp, '\n-----------------------------' status = 1 ScoreTp = self.__getSubSolutionScore(len(subSolution),commonGenesTp,0.75) while status>0: status = 0 tempScore = -1 tempRow = -1 for tpRow in selectPool: tempCommonGene = [0]*len(MatTp[1]) for idx in range(len(commonGenesTp)): tempCommonGene[idx] = commonGenesTp[idx] + MatTp[2][tpRow][idx] tempScore2 = self.__getSubSolutionScore(len(subSolution)+1,tempCommonGene,0.75) if tempScore2 > tempScore: tempScore = tempScore2 tempRow = tpRow if tempScore>ScoreTp: status = 1 ScoreTp = tempScore subSolution.append(tempRow) selectPool.remove(tempRow) for idx in range(len(commonGenesTp)): commonGenesTp[idx] += MatTp[2][tempRow][idx] if ScoreTp > L_Score: L_Score = ScoreTp L_instances = [] for item in subSolution: L_instances.append(MatTp[0][item]) L_comGenes = [] for idx in range(len(commonGenesTp)): if commonGenesTp[idx]>=len(subSolution)*0.75: L_comGenes.append(MatTp[1][idx]) #print 'coexpressed=', L_comGenes #print 'instance=', L_instances #print 'score=',L_Score L_instancesUpdate = [] L_geneName2idx = {} for idx in range(len(MatTp[1])): L_geneName2idx[MatTp[1][idx]] = idx for idxInst in range(len(MatTp[0])): countDegree = 0 for gene in L_comGenes: if MatTp[2][idxInst][L_geneName2idx[gene]]==1: countDegree += 1 if countDegree>=len(L_comGenes)*0.75: L_instancesUpdate.append(MatTp[0][idxInst]) return [L_instances,L_comGenes] def test(self): subsets = self.__k_subsets(self.__base,2) for item in subsets: print item ''' #Example of using the class DenseSubGraph #Create an object of the class DenseSubGraph Object = DenseSubGraph() #Define a bipartite graph BiGraph = {'RPS24A(**9)': ['AIR1', 'CLN2', 'GLN1', 'IMP2', 'LSB1', 'PGI1', 'PRK1', 'PTK2', 'RGT1', 'RPB5', 'RVS167', 'SRV2', 'SSA1', 'UBX6'], 'FKS1(HAPLOID)': ['ABP1', 'ACT1', 'AKL1', 'ARG80', 'BUL1', 'CLN2', 'GYL1', 'HUA1', 'LAS17', 'LSB1', 'PAN1', 'PTK2', 'RHO1', 'RPC40', 'RVS167', 'SRV2', 'TOR2', 'UBP5', 'UTP9', 'VRP1', 'YPR078C'], 'RPS24A(HAPLOID)': ['AIP1', 'DED1', 'GCN4', 'GLK1', 'GYP5', 'MCM1', 'PTK2', 'SAC6', 'SLA2', 'SSA1', 'YSC84'], 'KAR2(TET PROMOTER)': ['ABP1', 'ACT1', 'AIR1', 'AKL1', 'CBK1', 'CLN2', 'CLN3', 'CRN1', 'GTS1', 'GYP5', 'HRD1', 'HUA1', 'IMP2', 'LSB1', 'MMS2', 'MSB1', 'MYO3', 'PAN1', 'PTK2', 'RGT1', 'RPB5', 'RPC40', 'RUP1', 'RVS167', 'SAC6', 'SRV2', 'TWF1', 'UBP5', 'UTP9', 'YSC84'], 'FKS1(TET PROMOTER)': ['ABP1', 'CLN2', 'CLN3', 'FKS1', 'LSB1', 'MYO3', 'PTK2', 'SCD5', 'SRV2', 'UBP5', 'YPR078C'], 'CDC42(TET PROMOTER)': ['ABP1', 'ACT1', 'AKL1', 'APC5', 'ARC15', 'BEM2', 'BUL1', 'CBK1', 'CDC3', 'CDC42', 'CLN2', 'CLN3', 'CRN1', 'FCY2', 'FKS1', 'HUA1', 'LSB1', 'MCM6', 'MMS2', 'MYO3', 'PRO3', 'PTK2', 'RPB3', 'RPB5', 'RPC40', 'SCP1', 'SLA2', 'SLG1', 'SRV2', 'STE12', 'SUC2', 'TWF1', 'UBP5', 'UTP9', 'YPR078C', 'YSC84'], 'SHE4': ['ACT1', 'AKL1', 'ARC15', 'ARC18', 'ARC35', 'ARC40', 'CAP1', 'CAP2', 'CLN2', 'DUN1', 'ENT1', 'ERB1', 'FCY2', 'FKS1', 'GYP5', 'HXT6', 'MCM6', 'MMS2', 'MYO3', 'PHO84', 'RHO1', 'RPA43', 'RPC40', 'RPO21', 'RVS161', 'SAC6', 'SCD5', 'SLA2', 'STT4', 'TWF1', 'UBP5', 'UBX6', 'UTP9', 'VRP1', 'YPR078C'], 'GLUCOSAMINE': ['ABP1', 'ARG80', 'CRN1', 'HXT6', 'PGI1', 'PRO3', 'PTK2', 'SRV2', 'UBP5', 'VRP1', 'YPR078C'], 'RPL12A': ['ABP1', 'CLN2', 'CLN3', 'CRN1', 'FCY2', 'GCN4', 'GTS1', 'HUA1', 'LOT6', 'PTK2', 'RVS167', 'SLA2', 'SRV2', 'UTP9']} #Find the highly dense subgrpah Result = Object.Find_Subgraph(BiGraph) #print Result print 'Nodes in upper set:',Result[0] print 'Nodes in lower set:',Result[1] '''