You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

cyclicPatternKernel.py 6.8 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147
  1. """
  2. @author: linlin <jajupmochi@gmail.com>
  3. @references:
  4. [1] Tamás Horváth, Thomas Gärtner, and Stefan Wrobel. Cyclic pattern kernels for predictive graph mining. In Proceedings of the tenth ACM SIGKDD international conference on Knowledge discovery and data mining, pages 158–167. ACM, 2004.
  5. [2] Hopcroft, J.; Tarjan, R. (1973). “Efficient algorithms for graph manipulation”. Communications of the ACM 16: 372–378. doi:10.1145/362248.362272.
  6. [3] Finding all the elementary circuits of a directed graph. D. B. Johnson, SIAM Journal on Computing 4, no. 1, 77-84, 1975. http://dx.doi.org/10.1137/0204007
  7. """
  8. import sys
  9. import pathlib
  10. sys.path.insert(0, "../")
  11. import time
  12. import networkx as nx
  13. import numpy as np
  14. from tqdm import tqdm
  15. def cyclicpatternkernel(*args, node_label = 'atom', edge_label = 'bond_type', labeled = True, cycle_bound = None):
  16. """Calculate cyclic pattern graph kernels between graphs.
  17. Parameters
  18. ----------
  19. Gn : List of NetworkX graph
  20. List of graphs between which the kernels are calculated.
  21. /
  22. G1, G2 : NetworkX graphs
  23. 2 graphs between which the kernel is calculated.
  24. node_label : string
  25. node attribute used as label. The default node label is atom.
  26. edge_label : string
  27. edge attribute used as label. The default edge label is bond_type.
  28. labeled : boolean
  29. Whether the graphs are labeled. The default is True.
  30. depth : integer
  31. Depth of search. Longest length of paths.
  32. Return
  33. ------
  34. Kmatrix : Numpy matrix
  35. Kernel matrix, each element of which is the path kernel up to d between 2 praphs.
  36. """
  37. Gn = args[0] if len(args) == 1 else [args[0], args[1]] # arrange all graphs in a list
  38. Kmatrix = np.zeros((len(Gn), len(Gn)))
  39. start_time = time.time()
  40. # get all cyclic and tree patterns of all graphs before calculating kernels to save time, but this may consume a lot of memory for large dataset.
  41. all_patterns = [ get_patterns(Gn[i], node_label=node_label, edge_label = edge_label, labeled = labeled, cycle_bound = cycle_bound)
  42. for i in tqdm(range(0, len(Gn)), desc='retrieve patterns', file=sys.stdout) ]
  43. for i in tqdm(range(0, len(Gn)), desc='calculate kernels', file=sys.stdout):
  44. for j in range(i, len(Gn)):
  45. Kmatrix[i][j] = _cyclicpatternkernel_do(all_patterns[i], all_patterns[j])
  46. Kmatrix[j][i] = Kmatrix[i][j]
  47. run_time = time.time() - start_time
  48. print("\n --- kernel matrix of cyclic pattern kernel of size %d built in %s seconds ---" % (len(Gn), run_time))
  49. return Kmatrix, run_time
  50. def _cyclicpatternkernel_do(patterns1, patterns2):
  51. """Calculate path graph kernels up to depth d between 2 graphs.
  52. Parameters
  53. ----------
  54. paths1, paths2 : list
  55. List of paths in 2 graphs, where for unlabeled graphs, each path is represented by a list of nodes; while for labeled graphs, each path is represented by a string consists of labels of nodes and edges on that path.
  56. k_func : function
  57. A kernel function used using different notions of fingerprint similarity.
  58. node_label : string
  59. node attribute used as label. The default node label is atom.
  60. edge_label : string
  61. edge attribute used as label. The default edge label is bond_type.
  62. labeled : boolean
  63. Whether the graphs are labeled. The default is True.
  64. Return
  65. ------
  66. kernel : float
  67. Treelet Kernel between 2 graphs.
  68. """
  69. return len(set(patterns1) & set(patterns2))
  70. def get_patterns(G, node_label = 'atom', edge_label = 'bond_type', labeled = True, cycle_bound = None):
  71. """Find all cyclic and tree patterns in a graph.
  72. Parameters
  73. ----------
  74. G : NetworkX graphs
  75. The graph in which paths are searched.
  76. length : integer
  77. The maximum length of paths.
  78. node_label : string
  79. node attribute used as label. The default node label is atom.
  80. edge_label : string
  81. edge attribute used as label. The default edge label is bond_type.
  82. labeled : boolean
  83. Whether the graphs are labeled. The default is True.
  84. Return
  85. ------
  86. path : list
  87. List of paths retrieved, where for unlabeled graphs, each path is represented by a list of nodes; while for labeled graphs, each path is represented by a string consists of labels of nodes and edges on that path.
  88. """
  89. number_simplecycles = 0
  90. bridges = nx.Graph()
  91. patterns = []
  92. bicomponents = nx.biconnected_component_subgraphs(G) # all biconnected components of G. this function use algorithm in reference [2], which (i guess) is slightly different from the one used in paper [1]
  93. for subgraph in bicomponents:
  94. if nx.number_of_edges(subgraph) > 1:
  95. simple_cycles = list(nx.simple_cycles(G.to_directed())) # all simple cycles in biconnected components. this function use algorithm in reference [3], which has time complexity O((n+e)(N+1)) for n nodes, e edges and N simple cycles. Which might be slower than the algorithm applied in paper [1]
  96. if cycle_bound != None and len(simple_cycles) > cycle_bound - number_simplecycles: # in paper [1], when applying another algorithm (subroutine RT), this becomes len(simple_cycles) == cycle_bound - number_simplecycles + 1, check again.
  97. return []
  98. else:
  99. # calculate canonical representation for each simple cycle
  100. all_canonkeys = []
  101. for cycle in simple_cycles:
  102. canonlist = [ G.node[node][node_label] + G[node][cycle[cycle.index(node) + 1]][edge_label] for node in cycle[:-1] ]
  103. canonkey = ''.join(canonlist)
  104. canonkey = canonkey if canonkey < canonkey[::-1] else canonkey[::-1]
  105. for i in range(1, len(cycle[:-1])):
  106. canonlist = [ G.node[node][node_label] + G[node][cycle[cycle.index(node) + 1]][edge_label] for node in cycle[i:-1] + cycle[:i] ]
  107. canonkey_t = ''.join(canonlist)
  108. canonkey_t = canonkey_t if canonkey_t < canonkey_t[::-1] else canonkey_t[::-1]
  109. canonkey = canonkey if canonkey < canonkey_t else canonkey_t
  110. all_canonkeys.append(canonkey)
  111. patterns = list(set(patterns) | set(all_canonkeys))
  112. number_simplecycles += len(simple_cycles)
  113. else:
  114. bridges.add_edges_from(subgraph.edges(data=True))
  115. # calculate canonical representation for each connected component in bridge set
  116. components = list(nx.connected_component_subgraphs(bridges)) # all connected components in the bridge
  117. tree_patterns = []
  118. for tree in components:
  119. break
  120. # patterns += pi(bridges)
  121. return patterns

A Python package for graph kernels, graph edit distances and graph pre-image problem.