You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

weisfeilerLehmanKernel.py 23 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549
  1. """
  2. @author: linlin
  3. @references:
  4. [1] Shervashidze N, Schweitzer P, Leeuwen EJ, Mehlhorn K, Borgwardt KM.
  5. Weisfeiler-lehman graph kernels. Journal of Machine Learning Research.
  6. 2011;12(Sep):2539-61.
  7. """
  8. import sys
  9. from collections import Counter
  10. sys.path.insert(0, "../")
  11. from functools import partial
  12. import time
  13. #from multiprocessing import Pool
  14. from tqdm import tqdm
  15. import networkx as nx
  16. import numpy as np
  17. #from pygraph.kernels.pathKernel import pathkernel
  18. from pygraph.utils.graphdataset import get_dataset_attributes
  19. from pygraph.utils.parallel import parallel_gm
  20. # @todo: support edge kernel, sp kernel, user-defined kernel.
  21. def weisfeilerlehmankernel(*args,
  22. node_label='atom',
  23. edge_label='bond_type',
  24. height=0,
  25. base_kernel='subtree',
  26. parallel=None,
  27. n_jobs=None,
  28. verbose=True):
  29. """Calculate Weisfeiler-Lehman kernels between graphs.
  30. Parameters
  31. ----------
  32. Gn : List of NetworkX graph
  33. List of graphs between which the kernels are calculated.
  34. /
  35. G1, G2 : NetworkX graphs
  36. 2 graphs between which the kernel is calculated.
  37. node_label : string
  38. node attribute used as label. The default node label is atom.
  39. edge_label : string
  40. edge attribute used as label. The default edge label is bond_type.
  41. height : int
  42. subtree height
  43. base_kernel : string
  44. base kernel used in each iteration of WL kernel. The default base kernel is subtree kernel. For user-defined kernel, base_kernel is the name of the base kernel function used in each iteration of WL kernel. This function returns a Numpy matrix, each element of which is the user-defined Weisfeiler-Lehman kernel between 2 praphs.
  45. Return
  46. ------
  47. Kmatrix : Numpy matrix
  48. Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs.
  49. Notes
  50. -----
  51. This function now supports WL subtree kernel only.
  52. """
  53. # pre-process
  54. base_kernel = base_kernel.lower()
  55. Gn = args[0] if len(args) == 1 else [args[0], args[1]] # arrange all graphs in a list
  56. Gn = [g.copy() for g in Gn]
  57. ds_attrs = get_dataset_attributes(Gn, attr_names=['node_labeled'],
  58. node_label=node_label)
  59. if not ds_attrs['node_labeled']:
  60. for G in Gn:
  61. nx.set_node_attributes(G, '0', 'atom')
  62. start_time = time.time()
  63. # for WL subtree kernel
  64. if base_kernel == 'subtree':
  65. Kmatrix = _wl_kernel_do(Gn, node_label, edge_label, height, parallel, n_jobs, verbose)
  66. # for WL shortest path kernel
  67. elif base_kernel == 'sp':
  68. Kmatrix = _wl_spkernel_do(Gn, node_label, edge_label, height)
  69. # for WL edge kernel
  70. elif base_kernel == 'edge':
  71. Kmatrix = _wl_edgekernel_do(Gn, node_label, edge_label, height)
  72. # for user defined base kernel
  73. else:
  74. Kmatrix = _wl_userkernel_do(Gn, node_label, edge_label, height, base_kernel)
  75. run_time = time.time() - start_time
  76. if verbose:
  77. print("\n --- Weisfeiler-Lehman %s kernel matrix of size %d built in %s seconds ---"
  78. % (base_kernel, len(args[0]), run_time))
  79. return Kmatrix, run_time
  80. def _wl_kernel_do(Gn, node_label, edge_label, height, parallel, n_jobs, verbose):
  81. """Calculate Weisfeiler-Lehman kernels between graphs.
  82. Parameters
  83. ----------
  84. Gn : List of NetworkX graph
  85. List of graphs between which the kernels are calculated.
  86. node_label : string
  87. node attribute used as label.
  88. edge_label : string
  89. edge attribute used as label.
  90. height : int
  91. wl height.
  92. Return
  93. ------
  94. Kmatrix : Numpy matrix
  95. Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs.
  96. """
  97. height = int(height)
  98. Kmatrix = np.zeros((len(Gn), len(Gn)))
  99. # initial for height = 0
  100. all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration
  101. # for each graph
  102. for G in Gn:
  103. # get the set of original labels
  104. labels_ori = list(nx.get_node_attributes(G, node_label).values())
  105. # number of occurence of each label in G
  106. all_num_of_each_label.append(dict(Counter(labels_ori)))
  107. # calculate subtree kernel with the 0th iteration and add it to the final kernel
  108. compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel, n_jobs, False)
  109. # iterate each height
  110. for h in range(1, height + 1):
  111. all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration
  112. num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs
  113. # all_labels_ori = set() # all unique orignal labels in all graphs in this iteration
  114. all_num_of_each_label = [] # number of occurence of each label in G
  115. # # for each graph
  116. # # ---- use pool.imap_unordered to parallel and track progress. ----
  117. # pool = Pool(n_jobs)
  118. # itr = zip(Gn, range(0, len(Gn)))
  119. # if len(Gn) < 100 * n_jobs:
  120. # chunksize = int(len(Gn) / n_jobs) + 1
  121. # else:
  122. # chunksize = 100
  123. # all_multisets_list = [[] for _ in range(len(Gn))]
  124. ## set_unique_list = [[] for _ in range(len(Gn))]
  125. # get_partial = partial(wrapper_wl_iteration, node_label)
  126. ## if verbose:
  127. ## iterator = tqdm(pool.imap_unordered(get_partial, itr, chunksize),
  128. ## desc='wl iteration', file=sys.stdout)
  129. ## else:
  130. # iterator = pool.imap_unordered(get_partial, itr, chunksize)
  131. # for i, all_multisets in iterator:
  132. # all_multisets_list[i] = all_multisets
  133. ## set_unique_list[i] = set_unique
  134. ## all_set_unique = all_set_unique | set(set_unique)
  135. # pool.close()
  136. # pool.join()
  137. # all_set_unique = set()
  138. # for uset in all_multisets_list:
  139. # all_set_unique = all_set_unique | set(uset)
  140. #
  141. # all_set_unique = list(all_set_unique)
  142. ## # a dictionary mapping original labels to new ones.
  143. ## set_compressed = {}
  144. ## for idx, uset in enumerate(all_set_unique):
  145. ## set_compressed.update({uset: idx})
  146. #
  147. # for ig, G in enumerate(Gn):
  148. #
  149. ## # a dictionary mapping original labels to new ones.
  150. ## set_compressed = {}
  151. ## # if a label occured before, assign its former compressed label,
  152. ## # else assign the number of labels occured + 1 as the compressed label.
  153. ## for value in set_unique_list[i]:
  154. ## if uset in all_set_unique:
  155. ## set_compressed.update({uset: all_set_compressed[value]})
  156. ## else:
  157. ## set_compressed.update({value: str(num_of_labels_occured + 1)})
  158. ## num_of_labels_occured += 1
  159. #
  160. ## all_set_compressed.update(set_compressed)
  161. #
  162. # # relabel nodes
  163. # for idx, node in enumerate(G.nodes()):
  164. # G.nodes[node][node_label] = all_set_unique.index(all_multisets_list[ig][idx])
  165. #
  166. # # get the set of compressed labels
  167. # labels_comp = list(nx.get_node_attributes(G, node_label).values())
  168. ## all_labels_ori.update(labels_comp)
  169. # all_num_of_each_label[ig] = dict(Counter(labels_comp))
  170. # all_set_unique = list(all_set_unique)
  171. # @todo: parallel this part.
  172. for idx, G in enumerate(Gn):
  173. all_multisets = []
  174. for node, attrs in G.nodes(data=True):
  175. # Multiset-label determination.
  176. multiset = [G.nodes[neighbors][node_label] for neighbors in G[node]]
  177. # sorting each multiset
  178. multiset.sort()
  179. multiset = [attrs[node_label]] + multiset # add the prefix
  180. all_multisets.append(tuple(multiset))
  181. # label compression
  182. set_unique = list(set(all_multisets)) # set of unique multiset labels
  183. # a dictionary mapping original labels to new ones.
  184. set_compressed = {}
  185. # if a label occured before, assign its former compressed label,
  186. # else assign the number of labels occured + 1 as the compressed label.
  187. for value in set_unique:
  188. if value in all_set_compressed.keys():
  189. set_compressed.update({value: all_set_compressed[value]})
  190. else:
  191. set_compressed.update({value: str(num_of_labels_occured + 1)})
  192. num_of_labels_occured += 1
  193. all_set_compressed.update(set_compressed)
  194. # relabel nodes
  195. for idx, node in enumerate(G.nodes()):
  196. G.nodes[node][node_label] = set_compressed[all_multisets[idx]]
  197. # get the set of compressed labels
  198. labels_comp = list(nx.get_node_attributes(G, node_label).values())
  199. # all_labels_ori.update(labels_comp)
  200. all_num_of_each_label.append(dict(Counter(labels_comp)))
  201. # calculate subtree kernel with h iterations and add it to the final kernel
  202. compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel, n_jobs, False)
  203. return Kmatrix
  204. def wl_iteration(G, node_label):
  205. all_multisets = []
  206. for node, attrs in G.nodes(data=True):
  207. # Multiset-label determination.
  208. multiset = [G.nodes[neighbors][node_label] for neighbors in G[node]]
  209. # sorting each multiset
  210. multiset.sort()
  211. multiset = [attrs[node_label]] + multiset # add the prefix
  212. all_multisets.append(tuple(multiset))
  213. # # label compression
  214. # set_unique = list(set(all_multisets)) # set of unique multiset labels
  215. return all_multisets
  216. # # a dictionary mapping original labels to new ones.
  217. # set_compressed = {}
  218. # # if a label occured before, assign its former compressed label,
  219. # # else assign the number of labels occured + 1 as the compressed label.
  220. # for value in set_unique:
  221. # if value in all_set_compressed.keys():
  222. # set_compressed.update({value: all_set_compressed[value]})
  223. # else:
  224. # set_compressed.update({value: str(num_of_labels_occured + 1)})
  225. # num_of_labels_occured += 1
  226. #
  227. # all_set_compressed.update(set_compressed)
  228. #
  229. # # relabel nodes
  230. # for idx, node in enumerate(G.nodes()):
  231. # G.nodes[node][node_label] = set_compressed[all_multisets[idx]]
  232. #
  233. # # get the set of compressed labels
  234. # labels_comp = list(nx.get_node_attributes(G, node_label).values())
  235. # all_labels_ori.update(labels_comp)
  236. # all_num_of_each_label.append(dict(Counter(labels_comp)))
  237. # return
  238. def wrapper_wl_iteration(node_label, itr_item):
  239. g = itr_item[0]
  240. i = itr_item[1]
  241. all_multisets = wl_iteration(g, node_label)
  242. return i, all_multisets
  243. def compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel, n_jobs, verbose):
  244. """Compute kernel matrix using the base kernel.
  245. """
  246. if parallel == 'imap_unordered':
  247. # compute kernels.
  248. def init_worker(alllabels_toshare):
  249. global G_alllabels
  250. G_alllabels = alllabels_toshare
  251. do_partial = partial(wrapper_compute_subtree_kernel, Kmatrix)
  252. parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
  253. glbv=(all_num_of_each_label,), n_jobs=n_jobs, verbose=verbose)
  254. elif parallel == None:
  255. for i in range(len(Kmatrix)):
  256. for j in range(i, len(Kmatrix)):
  257. Kmatrix[i][j] = compute_subtree_kernel(all_num_of_each_label[i],
  258. all_num_of_each_label[j], Kmatrix[i][j])
  259. Kmatrix[j][i] = Kmatrix[i][j]
  260. def compute_subtree_kernel(num_of_each_label1, num_of_each_label2, kernel):
  261. """Compute the subtree kernel.
  262. """
  263. labels = set(list(num_of_each_label1.keys()) + list(num_of_each_label2.keys()))
  264. vector1 = np.array([(num_of_each_label1[label]
  265. if (label in num_of_each_label1.keys()) else 0)
  266. for label in labels])
  267. vector2 = np.array([(num_of_each_label2[label]
  268. if (label in num_of_each_label2.keys()) else 0)
  269. for label in labels])
  270. kernel += np.dot(vector1, vector2)
  271. return kernel
  272. def wrapper_compute_subtree_kernel(Kmatrix, itr):
  273. i = itr[0]
  274. j = itr[1]
  275. return i, j, compute_subtree_kernel(G_alllabels[i], G_alllabels[j], Kmatrix[i][j])
  276. def _wl_spkernel_do(Gn, node_label, edge_label, height):
  277. """Calculate Weisfeiler-Lehman shortest path kernels between graphs.
  278. Parameters
  279. ----------
  280. Gn : List of NetworkX graph
  281. List of graphs between which the kernels are calculated.
  282. node_label : string
  283. node attribute used as label.
  284. edge_label : string
  285. edge attribute used as label.
  286. height : int
  287. subtree height.
  288. Return
  289. ------
  290. Kmatrix : Numpy matrix
  291. Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs.
  292. """
  293. pass
  294. from pygraph.utils.utils import getSPGraph
  295. # init.
  296. height = int(height)
  297. Kmatrix = np.zeros((len(Gn), len(Gn))) # init kernel
  298. Gn = [ getSPGraph(G, edge_weight = edge_label) for G in Gn ] # get shortest path graphs of Gn
  299. # initial for height = 0
  300. for i in range(0, len(Gn)):
  301. for j in range(i, len(Gn)):
  302. for e1 in Gn[i].edges(data = True):
  303. for e2 in Gn[j].edges(data = True):
  304. if e1[2]['cost'] != 0 and e1[2]['cost'] == e2[2]['cost'] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
  305. Kmatrix[i][j] += 1
  306. Kmatrix[j][i] = Kmatrix[i][j]
  307. # iterate each height
  308. for h in range(1, height + 1):
  309. all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration
  310. num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs
  311. for G in Gn: # for each graph
  312. set_multisets = []
  313. for node in G.nodes(data = True):
  314. # Multiset-label determination.
  315. multiset = [ G.node[neighbors][node_label] for neighbors in G[node[0]] ]
  316. # sorting each multiset
  317. multiset.sort()
  318. multiset = node[1][node_label] + ''.join(multiset) # concatenate to a string and add the prefix
  319. set_multisets.append(multiset)
  320. # label compression
  321. set_unique = list(set(set_multisets)) # set of unique multiset labels
  322. # a dictionary mapping original labels to new ones.
  323. set_compressed = {}
  324. # if a label occured before, assign its former compressed label, else assign the number of labels occured + 1 as the compressed label
  325. for value in set_unique:
  326. if value in all_set_compressed.keys():
  327. set_compressed.update({ value : all_set_compressed[value] })
  328. else:
  329. set_compressed.update({ value : str(num_of_labels_occured + 1) })
  330. num_of_labels_occured += 1
  331. all_set_compressed.update(set_compressed)
  332. # relabel nodes
  333. for node in G.nodes(data = True):
  334. node[1][node_label] = set_compressed[set_multisets[node[0]]]
  335. # calculate subtree kernel with h iterations and add it to the final kernel
  336. for i in range(0, len(Gn)):
  337. for j in range(i, len(Gn)):
  338. for e1 in Gn[i].edges(data = True):
  339. for e2 in Gn[j].edges(data = True):
  340. if e1[2]['cost'] != 0 and e1[2]['cost'] == e2[2]['cost'] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
  341. Kmatrix[i][j] += 1
  342. Kmatrix[j][i] = Kmatrix[i][j]
  343. return Kmatrix
  344. def _wl_edgekernel_do(Gn, node_label, edge_label, height):
  345. """Calculate Weisfeiler-Lehman edge kernels between graphs.
  346. Parameters
  347. ----------
  348. Gn : List of NetworkX graph
  349. List of graphs between which the kernels are calculated.
  350. node_label : string
  351. node attribute used as label.
  352. edge_label : string
  353. edge attribute used as label.
  354. height : int
  355. subtree height.
  356. Return
  357. ------
  358. Kmatrix : Numpy matrix
  359. Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs.
  360. """
  361. pass
  362. # init.
  363. height = int(height)
  364. Kmatrix = np.zeros((len(Gn), len(Gn))) # init kernel
  365. # initial for height = 0
  366. for i in range(0, len(Gn)):
  367. for j in range(i, len(Gn)):
  368. for e1 in Gn[i].edges(data = True):
  369. for e2 in Gn[j].edges(data = True):
  370. if e1[2][edge_label] == e2[2][edge_label] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
  371. Kmatrix[i][j] += 1
  372. Kmatrix[j][i] = Kmatrix[i][j]
  373. # iterate each height
  374. for h in range(1, height + 1):
  375. all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration
  376. num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs
  377. for G in Gn: # for each graph
  378. set_multisets = []
  379. for node in G.nodes(data = True):
  380. # Multiset-label determination.
  381. multiset = [ G.node[neighbors][node_label] for neighbors in G[node[0]] ]
  382. # sorting each multiset
  383. multiset.sort()
  384. multiset = node[1][node_label] + ''.join(multiset) # concatenate to a string and add the prefix
  385. set_multisets.append(multiset)
  386. # label compression
  387. set_unique = list(set(set_multisets)) # set of unique multiset labels
  388. # a dictionary mapping original labels to new ones.
  389. set_compressed = {}
  390. # if a label occured before, assign its former compressed label, else assign the number of labels occured + 1 as the compressed label
  391. for value in set_unique:
  392. if value in all_set_compressed.keys():
  393. set_compressed.update({ value : all_set_compressed[value] })
  394. else:
  395. set_compressed.update({ value : str(num_of_labels_occured + 1) })
  396. num_of_labels_occured += 1
  397. all_set_compressed.update(set_compressed)
  398. # relabel nodes
  399. for node in G.nodes(data = True):
  400. node[1][node_label] = set_compressed[set_multisets[node[0]]]
  401. # calculate subtree kernel with h iterations and add it to the final kernel
  402. for i in range(0, len(Gn)):
  403. for j in range(i, len(Gn)):
  404. for e1 in Gn[i].edges(data = True):
  405. for e2 in Gn[j].edges(data = True):
  406. if e1[2][edge_label] == e2[2][edge_label] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
  407. Kmatrix[i][j] += 1
  408. Kmatrix[j][i] = Kmatrix[i][j]
  409. return Kmatrix
  410. def _wl_userkernel_do(Gn, node_label, edge_label, height, base_kernel):
  411. """Calculate Weisfeiler-Lehman kernels based on user-defined kernel between graphs.
  412. Parameters
  413. ----------
  414. Gn : List of NetworkX graph
  415. List of graphs between which the kernels are calculated.
  416. node_label : string
  417. node attribute used as label.
  418. edge_label : string
  419. edge attribute used as label.
  420. height : int
  421. subtree height.
  422. base_kernel : string
  423. Name of the base kernel function used in each iteration of WL kernel. This function returns a Numpy matrix, each element of which is the user-defined Weisfeiler-Lehman kernel between 2 praphs.
  424. Return
  425. ------
  426. Kmatrix : Numpy matrix
  427. Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs.
  428. """
  429. pass
  430. # init.
  431. height = int(height)
  432. Kmatrix = np.zeros((len(Gn), len(Gn))) # init kernel
  433. # initial for height = 0
  434. Kmatrix = base_kernel(Gn, node_label, edge_label)
  435. # iterate each height
  436. for h in range(1, height + 1):
  437. all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration
  438. num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs
  439. for G in Gn: # for each graph
  440. set_multisets = []
  441. for node in G.nodes(data = True):
  442. # Multiset-label determination.
  443. multiset = [ G.node[neighbors][node_label] for neighbors in G[node[0]] ]
  444. # sorting each multiset
  445. multiset.sort()
  446. multiset = node[1][node_label] + ''.join(multiset) # concatenate to a string and add the prefix
  447. set_multisets.append(multiset)
  448. # label compression
  449. set_unique = list(set(set_multisets)) # set of unique multiset labels
  450. # a dictionary mapping original labels to new ones.
  451. set_compressed = {}
  452. # if a label occured before, assign its former compressed label, else assign the number of labels occured + 1 as the compressed label
  453. for value in set_unique:
  454. if value in all_set_compressed.keys():
  455. set_compressed.update({ value : all_set_compressed[value] })
  456. else:
  457. set_compressed.update({ value : str(num_of_labels_occured + 1) })
  458. num_of_labels_occured += 1
  459. all_set_compressed.update(set_compressed)
  460. # relabel nodes
  461. for node in G.nodes(data = True):
  462. node[1][node_label] = set_compressed[set_multisets[node[0]]]
  463. # calculate kernel with h iterations and add it to the final kernel
  464. Kmatrix += base_kernel(Gn, node_label, edge_label)
  465. return Kmatrix

A Python package for graph kernels, graph edit distances and graph pre-image problem.