
weisfeilerLehmanKernel.py 23 kB

  1. """
  2. @author: linlin
  3. @references:
  4. [1] Shervashidze N, Schweitzer P, Leeuwen EJ, Mehlhorn K, Borgwardt KM.
  5. Weisfeiler-lehman graph kernels. Journal of Machine Learning Research.
  6. 2011;12(Sep):2539-61.
  7. """
  8. import sys
  9. from collections import Counter
  10. sys.path.insert(0, "../")
  11. from functools import partial
  12. import time
  13. #from multiprocessing import Pool
  14. from tqdm import tqdm
  15. import networkx as nx
  16. import numpy as np
  17. #from pygraph.kernels.pathKernel import pathkernel
  18. from pygraph.utils.graphdataset import get_dataset_attributes
  19. from pygraph.utils.parallel import parallel_gm
  20. # @todo: support edge kernel, sp kernel, user-defined kernel.
def weisfeilerlehmankernel(*args,
                           node_label='atom',
                           edge_label='bond_type',
                           height=0,
                           base_kernel='subtree',
                           parallel=None,
                           n_jobs=None,
                           verbose=True):
    """Calculate Weisfeiler-Lehman kernels between graphs.

    Parameters
    ----------
    Gn : list of NetworkX graphs
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is calculated.
    node_label : string
        Node attribute used as label. The default node label is 'atom'.
    edge_label : string
        Edge attribute used as label. The default edge label is 'bond_type'.
    height : int
        Subtree height.
    base_kernel : string
        Base kernel used in each iteration of the WL kernel. Only the default
        'subtree' kernel can be applied for now.
    parallel : None
        Parallelization method applied to compute the kernel. No
        parallelization can be applied for now.
    n_jobs : int
        Number of jobs for parallelization. The default is to use all
        computational cores. This argument is only valid when a
        parallelization method is applied and can be ignored for now.

    Returns
    -------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the Weisfeiler-Lehman kernel
        between two graphs.

    Notes
    -----
    This function now supports the WL subtree kernel only.
    """
    # pre-process
    base_kernel = base_kernel.lower()
    # arrange all graphs in a list
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    Gn = [g.copy() for g in Gn]
    ds_attrs = get_dataset_attributes(Gn, attr_names=['node_labeled'],
                                      node_label=node_label)
    if not ds_attrs['node_labeled']:
        for G in Gn:
            nx.set_node_attributes(G, '0', node_label)

    start_time = time.time()

    # for WL subtree kernel
    if base_kernel == 'subtree':
        Kmatrix = _wl_kernel_do(Gn, node_label, edge_label, height, parallel,
                                n_jobs, verbose)
    # for WL shortest path kernel
    elif base_kernel == 'sp':
        Kmatrix = _wl_spkernel_do(Gn, node_label, edge_label, height)
    # for WL edge kernel
    elif base_kernel == 'edge':
        Kmatrix = _wl_edgekernel_do(Gn, node_label, edge_label, height)
    # for user-defined base kernel
    else:
        Kmatrix = _wl_userkernel_do(Gn, node_label, edge_label, height,
                                    base_kernel)

    run_time = time.time() - start_time
    if verbose:
        print("\n --- Weisfeiler-Lehman %s kernel matrix of size %d built in "
              "%s seconds ---" % (base_kernel, len(Gn), run_time))

    return Kmatrix, run_time
def _wl_kernel_do(Gn, node_label, edge_label, height, parallel, n_jobs,
                  verbose):
    """Calculate Weisfeiler-Lehman subtree kernels between graphs.

    Parameters
    ----------
    Gn : list of NetworkX graphs
        List of graphs between which the kernels are calculated.
    node_label : string
        Node attribute used as label.
    edge_label : string
        Edge attribute used as label.
    height : int
        WL height.

    Returns
    -------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the Weisfeiler-Lehman kernel
        between two graphs.
    """
    height = int(height)
    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # initial for height = 0
    all_num_of_each_label = []  # number of occurrences of each label in each graph in this iteration
    # for each graph
    for G in Gn:
        # get the set of original labels
        labels_ori = list(nx.get_node_attributes(G, node_label).values())
        # number of occurrences of each label in G
        all_num_of_each_label.append(dict(Counter(labels_ori)))
    # calculate subtree kernel with the 0th iteration and add it to the final kernel
    compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel,
                          n_jobs, False)
    # iterate each height
    for h in range(1, height + 1):
        all_set_compressed = {}  # a dictionary mapping original labels to new ones in all graphs in this iteration
        num_of_labels_occured = 0  # number of labels that have occurred as node labels at least once in all graphs
        all_num_of_each_label = []  # number of occurrences of each label in G

        # @todo: parallelize this part.
        for G in Gn:
            all_multisets = []
            for node, attrs in G.nodes(data=True):
                # multiset-label determination
                multiset = [G.nodes[neighbor][node_label]
                            for neighbor in G[node]]
                # sort each multiset
                multiset.sort()
                # add the root label as prefix
                multiset = [attrs[node_label]] + multiset
                all_multisets.append(tuple(multiset))

            # label compression
            set_unique = list(set(all_multisets))  # set of unique multiset labels
            # a dictionary mapping original labels to new ones
            set_compressed = {}
            # if a label occurred before, assign its former compressed label;
            # else assign (number of labels occurred + 1) as the compressed label
            for value in set_unique:
                if value in all_set_compressed:
                    set_compressed[value] = all_set_compressed[value]
                else:
                    set_compressed[value] = str(num_of_labels_occured + 1)
                    num_of_labels_occured += 1

            all_set_compressed.update(set_compressed)

            # relabel nodes
            for idx, node in enumerate(G.nodes()):
                G.nodes[node][node_label] = set_compressed[all_multisets[idx]]

            # get the set of compressed labels
            labels_comp = list(nx.get_node_attributes(G, node_label).values())
            all_num_of_each_label.append(dict(Counter(labels_comp)))

        # calculate subtree kernel with h iterations and add it to the final kernel
        compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel,
                              n_jobs, False)

    return Kmatrix
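
# For illustration: in one WL iteration, a node labeled 'C' whose neighbors
# are labeled 'H', 'H' and 'O' receives the multiset label
# ('C', 'H', 'H', 'O'). Because all_set_compressed is shared across graphs
# within an iteration, identical multisets in different graphs map to the
# same compressed label, which makes the per-iteration label counts
# comparable between graphs.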
def wl_iteration(G, node_label):
    """Compute the multiset label of every node of G for one WL iteration."""
    all_multisets = []
    for node, attrs in G.nodes(data=True):
        # multiset-label determination
        multiset = [G.nodes[neighbor][node_label] for neighbor in G[node]]
        # sort each multiset
        multiset.sort()
        # add the root label as prefix
        multiset = [attrs[node_label]] + multiset
        all_multisets.append(tuple(multiset))
    return all_multisets
def wrapper_wl_iteration(node_label, itr_item):
    g = itr_item[0]
    i = itr_item[1]
    all_multisets = wl_iteration(g, node_label)
    return i, all_multisets
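
# For instance, on a path graph a-b-c whose nodes carry labels 'a', 'b', 'c',
# wl_iteration returns [('a', 'b'), ('b', 'a', 'c'), ('c', 'b')]: each tuple
# is a node's own label followed by its sorted neighbor labels.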
def compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel,
                          n_jobs, verbose):
    """Compute the kernel matrix using the base kernel."""
    if parallel == 'imap_unordered':
        # compute kernels
        def init_worker(alllabels_toshare):
            global G_alllabels
            G_alllabels = alllabels_toshare
        do_partial = partial(wrapper_compute_subtree_kernel, Kmatrix)
        parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                    glbv=(all_num_of_each_label,), n_jobs=n_jobs,
                    verbose=verbose)
    elif parallel is None:
        for i in range(len(Kmatrix)):
            for j in range(i, len(Kmatrix)):
                Kmatrix[i][j] = compute_subtree_kernel(
                    all_num_of_each_label[i], all_num_of_each_label[j],
                    Kmatrix[i][j])
                Kmatrix[j][i] = Kmatrix[i][j]
def compute_subtree_kernel(num_of_each_label1, num_of_each_label2, kernel):
    """Compute the subtree kernel from two graphs' label-count dicts."""
    labels = set(list(num_of_each_label1.keys())
                 + list(num_of_each_label2.keys()))
    vector1 = np.array([(num_of_each_label1[label]
                         if (label in num_of_each_label1) else 0)
                        for label in labels])
    vector2 = np.array([(num_of_each_label2[label]
                         if (label in num_of_each_label2) else 0)
                        for label in labels])
    kernel += np.dot(vector1, vector2)
    return kernel
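
# A worked example of the dot product above: for label counts {'1': 2, '2': 1}
# and {'1': 1, '3': 3}, the label union is {'1', '2', '3'}, the count vectors
# are [2, 1, 0] and [1, 0, 3], and the kernel increment is 2*1 + 1*0 + 0*3 = 2.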
def wrapper_compute_subtree_kernel(Kmatrix, itr):
    i = itr[0]
    j = itr[1]
    return i, j, compute_subtree_kernel(G_alllabels[i], G_alllabels[j],
                                        Kmatrix[i][j])
def _wl_spkernel_do(Gn, node_label, edge_label, height):
    """Calculate Weisfeiler-Lehman shortest path kernels between graphs.

    Parameters
    ----------
    Gn : list of NetworkX graphs
        List of graphs between which the kernels are calculated.
    node_label : string
        Node attribute used as label.
    edge_label : string
        Edge attribute used as label.
    height : int
        Subtree height.

    Returns
    -------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the Weisfeiler-Lehman kernel
        between two graphs.
    """
    from pygraph.utils.utils import getSPGraph

    # init
    height = int(height)
    Kmatrix = np.zeros((len(Gn), len(Gn)))  # init kernel

    # get shortest path graphs of Gn
    Gn = [getSPGraph(G, edge_weight=edge_label) for G in Gn]

    # initial for height = 0
    for i in range(0, len(Gn)):
        for j in range(i, len(Gn)):
            for e1 in Gn[i].edges(data=True):
                for e2 in Gn[j].edges(data=True):
                    if (e1[2]['cost'] != 0
                            and e1[2]['cost'] == e2[2]['cost']
                            and ((e1[0] == e2[0] and e1[1] == e2[1])
                                 or (e1[0] == e2[1] and e1[1] == e2[0]))):
                        Kmatrix[i][j] += 1
            Kmatrix[j][i] = Kmatrix[i][j]

    # iterate each height
    for h in range(1, height + 1):
        all_set_compressed = {}  # a dictionary mapping original labels to new ones in all graphs in this iteration
        num_of_labels_occured = 0  # number of labels that have occurred as node labels at least once in all graphs
        for G in Gn:  # for each graph
            set_multisets = []
            for node, attrs in G.nodes(data=True):
                # multiset-label determination
                multiset = [G.nodes[neighbor][node_label]
                            for neighbor in G[node]]
                # sort each multiset
                multiset.sort()
                # concatenate to a string and add the root label as prefix
                multiset = attrs[node_label] + ''.join(multiset)
                set_multisets.append(multiset)

            # label compression
            set_unique = list(set(set_multisets))  # set of unique multiset labels
            # a dictionary mapping original labels to new ones
            set_compressed = {}
            # if a label occurred before, assign its former compressed label;
            # else assign (number of labels occurred + 1) as the compressed label
            for value in set_unique:
                if value in all_set_compressed:
                    set_compressed[value] = all_set_compressed[value]
                else:
                    set_compressed[value] = str(num_of_labels_occured + 1)
                    num_of_labels_occured += 1

            all_set_compressed.update(set_compressed)

            # relabel nodes
            for idx, (node, attrs) in enumerate(G.nodes(data=True)):
                attrs[node_label] = set_compressed[set_multisets[idx]]

        # calculate subtree kernel with h iterations and add it to the final kernel
        for i in range(0, len(Gn)):
            for j in range(i, len(Gn)):
                for e1 in Gn[i].edges(data=True):
                    for e2 in Gn[j].edges(data=True):
                        if (e1[2]['cost'] != 0
                                and e1[2]['cost'] == e2[2]['cost']
                                and ((e1[0] == e2[0] and e1[1] == e2[1])
                                     or (e1[0] == e2[1] and e1[1] == e2[0]))):
                            Kmatrix[i][j] += 1
                Kmatrix[j][i] = Kmatrix[i][j]

    return Kmatrix
def _wl_edgekernel_do(Gn, node_label, edge_label, height):
    """Calculate Weisfeiler-Lehman edge kernels between graphs.

    Parameters
    ----------
    Gn : list of NetworkX graphs
        List of graphs between which the kernels are calculated.
    node_label : string
        Node attribute used as label.
    edge_label : string
        Edge attribute used as label.
    height : int
        Subtree height.

    Returns
    -------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the Weisfeiler-Lehman kernel
        between two graphs.
    """
    # init
    height = int(height)
    Kmatrix = np.zeros((len(Gn), len(Gn)))  # init kernel

    # initial for height = 0
    for i in range(0, len(Gn)):
        for j in range(i, len(Gn)):
            for e1 in Gn[i].edges(data=True):
                for e2 in Gn[j].edges(data=True):
                    if (e1[2][edge_label] == e2[2][edge_label]
                            and ((e1[0] == e2[0] and e1[1] == e2[1])
                                 or (e1[0] == e2[1] and e1[1] == e2[0]))):
                        Kmatrix[i][j] += 1
            Kmatrix[j][i] = Kmatrix[i][j]

    # iterate each height
    for h in range(1, height + 1):
        all_set_compressed = {}  # a dictionary mapping original labels to new ones in all graphs in this iteration
        num_of_labels_occured = 0  # number of labels that have occurred as node labels at least once in all graphs
        for G in Gn:  # for each graph
            set_multisets = []
            for node, attrs in G.nodes(data=True):
                # multiset-label determination
                multiset = [G.nodes[neighbor][node_label]
                            for neighbor in G[node]]
                # sort each multiset
                multiset.sort()
                # concatenate to a string and add the root label as prefix
                multiset = attrs[node_label] + ''.join(multiset)
                set_multisets.append(multiset)

            # label compression
            set_unique = list(set(set_multisets))  # set of unique multiset labels
            # a dictionary mapping original labels to new ones
            set_compressed = {}
            # if a label occurred before, assign its former compressed label;
            # else assign (number of labels occurred + 1) as the compressed label
            for value in set_unique:
                if value in all_set_compressed:
                    set_compressed[value] = all_set_compressed[value]
                else:
                    set_compressed[value] = str(num_of_labels_occured + 1)
                    num_of_labels_occured += 1

            all_set_compressed.update(set_compressed)

            # relabel nodes
            for idx, (node, attrs) in enumerate(G.nodes(data=True)):
                attrs[node_label] = set_compressed[set_multisets[idx]]

        # calculate subtree kernel with h iterations and add it to the final kernel
        for i in range(0, len(Gn)):
            for j in range(i, len(Gn)):
                for e1 in Gn[i].edges(data=True):
                    for e2 in Gn[j].edges(data=True):
                        if (e1[2][edge_label] == e2[2][edge_label]
                                and ((e1[0] == e2[0] and e1[1] == e2[1])
                                     or (e1[0] == e2[1] and e1[1] == e2[0]))):
                            Kmatrix[i][j] += 1
                Kmatrix[j][i] = Kmatrix[i][j]

    return Kmatrix
def _wl_userkernel_do(Gn, node_label, edge_label, height, base_kernel):
    """Calculate Weisfeiler-Lehman kernels based on a user-defined base
    kernel between graphs.

    Parameters
    ----------
    Gn : list of NetworkX graphs
        List of graphs between which the kernels are calculated.
    node_label : string
        Node attribute used as label.
    edge_label : string
        Edge attribute used as label.
    height : int
        Subtree height.
    base_kernel : function
        Base kernel function used in each iteration of the WL kernel. It
        returns a Numpy matrix, each element of which is the user-defined
        Weisfeiler-Lehman kernel between two graphs.

    Returns
    -------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the Weisfeiler-Lehman kernel
        between two graphs.
    """
    # init
    height = int(height)
    Kmatrix = np.zeros((len(Gn), len(Gn)))  # init kernel

    # initial for height = 0
    Kmatrix = base_kernel(Gn, node_label, edge_label)

    # iterate each height
    for h in range(1, height + 1):
        all_set_compressed = {}  # a dictionary mapping original labels to new ones in all graphs in this iteration
        num_of_labels_occured = 0  # number of labels that have occurred as node labels at least once in all graphs
        for G in Gn:  # for each graph
            set_multisets = []
            for node, attrs in G.nodes(data=True):
                # multiset-label determination
                multiset = [G.nodes[neighbor][node_label]
                            for neighbor in G[node]]
                # sort each multiset
                multiset.sort()
                # concatenate to a string and add the root label as prefix
                multiset = attrs[node_label] + ''.join(multiset)
                set_multisets.append(multiset)

            # label compression
            set_unique = list(set(set_multisets))  # set of unique multiset labels
            # a dictionary mapping original labels to new ones
            set_compressed = {}
            # if a label occurred before, assign its former compressed label;
            # else assign (number of labels occurred + 1) as the compressed label
            for value in set_unique:
                if value in all_set_compressed:
                    set_compressed[value] = all_set_compressed[value]
                else:
                    set_compressed[value] = str(num_of_labels_occured + 1)
                    num_of_labels_occured += 1

            all_set_compressed.update(set_compressed)

            # relabel nodes
            for idx, (node, attrs) in enumerate(G.nodes(data=True)):
                attrs[node_label] = set_compressed[set_multisets[idx]]

        # calculate kernel with h iterations and add it to the final kernel
        Kmatrix += base_kernel(Gn, node_label, edge_label)

    return Kmatrix
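
# A minimal usage sketch (hedged: it assumes the pygraph utilities imported
# at the top of this file are resolvable via the sys.path entry above). It
# builds two toy molecule-like graphs and computes their WL subtree kernel
# matrix without parallelization.
if __name__ == '__main__':
    G1 = nx.Graph()
    G1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'}),
                       (2, {'atom': 'C'})])
    G1.add_edges_from([(0, 1), (1, 2)])

    G2 = nx.Graph()
    G2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'C'})])
    G2.add_edges_from([(0, 1)])

    Kmatrix, run_time = weisfeilerlehmankernel(
        G1, G2, node_label='atom', height=2, base_kernel='subtree',
        parallel=None, verbose=True)
    print(Kmatrix)  # 2x2 symmetric kernel matrix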

A Python package for graph kernels, graph edit distances and the graph pre-image problem.