
test_parallel_chunksize.py 27 kB

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Test of parallelization: find the best parallel chunksize and iteration
separation scheme.
Created on Wed Sep 26 12:09:34 2018
@author: ljia
"""

import sys
import time
from itertools import combinations_with_replacement, product, combinations
from functools import partial
from multiprocessing import Pool
from tqdm import tqdm
import networkx as nx
import numpy as np
import functools
#import multiprocessing
from matplotlib import pyplot as plt
from sklearn.model_selection import ParameterGrid

sys.path.insert(0, "../")
sys.path.insert(0, "../../")
from libs import *
from gklearn.utils.utils import getSPGraph, direct_product
from gklearn.utils.graphdataset import get_dataset_attributes
from gklearn.utils.graphfiles import loadDataset
from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct


def spkernel(*args,
             node_label='atom',
             edge_weight=None,
             node_kernels=None,
             n_jobs=None,
             chunksize=1):
    """Calculate shortest-path kernels between graphs.
    """
    # pre-process
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]

    weight = None
    if edge_weight is None:
        print('\n No edge weight is specified. Set all weights to 1.\n')
    else:
        try:
            some_weight = list(
                nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
            if isinstance(some_weight, (float, int)):
                weight = edge_weight
            else:
                print(
                    '\n Edge weight with name %s is not float or integer. Set all weights to 1.\n'
                    % edge_weight)
        except:
            print(
                '\n Edge weight with name "%s" is not found in the edge attributes. Set all weights to 1.\n'
                % edge_weight)
    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'node_attr_dim', 'is_directed'],
        node_label=node_label)

    # remove graphs with no edges, as no sp can be found in their structures,
    # so the kernel between such a graph and itself will be zero.
    len_gn = len(Gn)
    Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_edges(G) != 0]
    idx = [G[0] for G in Gn]
    Gn = [G[1] for G in Gn]
    if len(Gn) != len_gn:
        print('\n %d graphs are removed as they don\'t contain edges.\n' %
              (len_gn - len(Gn)))

    start_time = time.time()

    pool = Pool(n_jobs)
    # get shortest path graphs of Gn
    getsp_partial = partial(wrapper_getSPGraph, weight)
    itr = zip(Gn, range(0, len(Gn)))
    for i, g in tqdm(
            pool.imap_unordered(getsp_partial, itr, chunksize),
            desc='getting sp graphs', file=sys.stdout):
        Gn[i] = g
    pool.close()
    pool.join()

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use pool.imap_unordered to parallelize and track progress. ----
    def init_worker(gn_toshare):
        global G_gn
        G_gn = gn_toshare

    do_partial = partial(wrapper_sp_do, ds_attrs, node_label, node_kernels)
    itr = combinations_with_replacement(range(0, len(Gn)), 2)
    with Pool(processes=n_jobs, initializer=init_worker,
              initargs=(Gn,)) as pool:
        for i, j, kernel in tqdm(
                pool.imap_unordered(do_partial, itr, chunksize),
                desc='calculating kernels', file=sys.stdout):
            Kmatrix[i][j] = kernel
            Kmatrix[j][i] = kernel

#    # ---- direct running, normally uses a single CPU core. ----
#    itr = combinations_with_replacement(range(0, len(Gn)), 2)
#    for i, j in tqdm(itr, desc='calculating kernels', file=sys.stdout):
#        kernel = spkernel_do(Gn[i], Gn[j], ds_attrs, node_label, node_kernels)
#        Kmatrix[i][j] = kernel
#        Kmatrix[j][i] = kernel

    run_time = time.time() - start_time
    print(
        "\n --- shortest path kernel matrix of size %d built in %s seconds ---"
        % (len(Gn), run_time))

    return Kmatrix, run_time, idx

def spkernel_do(g1, g2, ds_attrs, node_label, node_kernels):

    kernel = 0

    # compute shortest path matrices first, method borrowed from FCSP.
    if ds_attrs['node_labeled']:
        # node symb and non-symb labeled
        if ds_attrs['node_attr_dim'] > 0:
            kn = node_kernels['mix']
            vk_dict = {}  # shortest path matrices dict
            for n1, n2 in product(
                    g1.nodes(data=True), g2.nodes(data=True)):
                vk_dict[(n1[0], n2[0])] = kn(
                    n1[1][node_label], n2[1][node_label],
                    n1[1]['attributes'], n2[1]['attributes'])
        # node symb labeled
        else:
            kn = node_kernels['symb']
            vk_dict = {}  # shortest path matrices dict
            for n1 in g1.nodes(data=True):
                for n2 in g2.nodes(data=True):
                    vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label],
                                                 n2[1][node_label])
    else:
        # node non-symb labeled
        if ds_attrs['node_attr_dim'] > 0:
            kn = node_kernels['nsymb']
            vk_dict = {}  # shortest path matrices dict
            for n1 in g1.nodes(data=True):
                for n2 in g2.nodes(data=True):
                    vk_dict[(n1[0], n2[0])] = kn(n1[1]['attributes'],
                                                 n2[1]['attributes'])
        # node unlabeled
        else:
            for e1, e2 in product(
                    g1.edges(data=True), g2.edges(data=True)):
                if e1[2]['cost'] == e2[2]['cost']:
                    kernel += 1
            return kernel

    # compute graph kernels
    if ds_attrs['is_directed']:
        for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
            if e1[2]['cost'] == e2[2]['cost']:
                nk11, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[(e1[1],
                                                               e2[1])]
                kn1 = nk11 * nk22
                kernel += kn1
    else:
        for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
            if e1[2]['cost'] == e2[2]['cost']:
                # each edge walk is counted twice, starting from both its
                # extreme nodes.
                nk11, nk12, nk21, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[(
                    e1[0], e2[1])], vk_dict[(e1[1], e2[0])], vk_dict[(e1[1],
                                                                      e2[1])]
                kn1 = nk11 * nk22
                kn2 = nk12 * nk21
                kernel += kn1 + kn2

    return kernel

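# A tiny worked example (added for illustration; not part of the original
# test): for two unlabeled triangles whose shortest-path graphs carry a
# 'cost' of 1 on every edge, spkernel_do takes the unlabeled branch above
# and simply counts the 3 x 3 = 9 equal-cost edge pairs:
#
#     g = nx.complete_graph(3)
#     nx.set_edge_attributes(g, 1, 'cost')
#     ds_attrs = {'node_labeled': False, 'node_attr_dim': 0,
#                 'is_directed': False}
#     spkernel_do(g, g, ds_attrs, 'atom', None)  # -> 9
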
def wrapper_sp_do(ds_attrs, node_label, node_kernels, itr):
    i = itr[0]
    j = itr[1]
    return i, j, spkernel_do(G_gn[i], G_gn[j], ds_attrs, node_label,
                             node_kernels)


def wrapper_getSPGraph(weight, itr_item):
    g = itr_item[0]
    i = itr_item[1]
    return i, getSPGraph(g, edge_weight=weight)

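# A minimal, self-contained sketch (added; the names _init_worker, _task and
# some_graph_list are illustrative) of the initializer-based sharing pattern
# used in spkernel above: the graph list is handed to each worker once via
# the Pool initializer instead of being pickled into every task, so each
# task only carries an (i, j) index pair:
#
#     def _init_worker(data_toshare):
#         global _shared_data
#         _shared_data = data_toshare
#
#     def _task(ij):
#         i, j = ij
#         return i, j, len(_shared_data[i]) * len(_shared_data[j])
#
#     with Pool(processes=2, initializer=_init_worker,
#               initargs=(some_graph_list,)) as p:
#         for res in p.imap_unordered(
#                 _task, combinations_with_replacement(
#                     range(len(some_graph_list)), 2)):
#             print(res)
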
#
#
#def commonwalkkernel(*args,
#                     node_label='atom',
#                     edge_label='bond_type',
#                     n=None,
#                     weight=1,
#                     compute_method=None,
#                     n_jobs=None,
#                     chunksize=1):
#    """Calculate common walk graph kernels between graphs.
#    """
#    compute_method = compute_method.lower()
#    # arrange all graphs in a list
#    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
#    Kmatrix = np.zeros((len(Gn), len(Gn)))
#    ds_attrs = get_dataset_attributes(
#        Gn,
#        attr_names=['node_labeled', 'edge_labeled', 'is_directed'],
#        node_label=node_label,
#        edge_label=edge_label)
#    if not ds_attrs['node_labeled']:
#        for G in Gn:
#            nx.set_node_attributes(G, '0', 'atom')
#    if not ds_attrs['edge_labeled']:
#        for G in Gn:
#            nx.set_edge_attributes(G, '0', 'bond_type')
#    if not ds_attrs['is_directed']:  # convert
#        Gn = [G.to_directed() for G in Gn]
#
#    start_time = time.time()
#
#    # ---- use pool.imap_unordered to parallelize and track progress. ----
#    pool = Pool(n_jobs)
#    itr = combinations_with_replacement(range(0, len(Gn)), 2)
##    len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
##    if len_itr < 100:
##        chunksize, extra = divmod(len_itr, n_jobs * 4)
##        if extra:
##            chunksize += 1
##    else:
##        chunksize = 100
#
#    # direct product graph method - exponential
#    if compute_method == 'exp':
#        do_partial = partial(_commonwalkkernel_exp, Gn, node_label,
#                             edge_label, weight)
#    # direct product graph method - geometric
#    elif compute_method == 'geo':
#        do_partial = partial(_commonwalkkernel_geo, Gn, node_label,
#                             edge_label, weight)
#
#    for i, j, kernel in tqdm(
#            pool.imap_unordered(do_partial, itr, chunksize),
#            desc='calculating kernels',
#            file=sys.stdout):
#        Kmatrix[i][j] = kernel
#        Kmatrix[j][i] = kernel
#    pool.close()
#    pool.join()
#
#    run_time = time.time() - start_time
#    print(
#        "\n --- kernel matrix of common walk kernel of size %d built in %s seconds ---"
#        % (len(Gn), run_time))
#
#    return Kmatrix, run_time
#
#
#def _commonwalkkernel_exp(Gn, node_label, edge_label, beta, ij):
#    """Calculate walk graph kernels up to n between two graphs using an
#    exponential series.
#    """
#    i = ij[0]
#    j = ij[1]
#    g1 = Gn[i]
#    g2 = Gn[j]
#
#    # get tensor product / direct product
#    gp = direct_product(g1, g2, node_label, edge_label)
#    A = nx.adjacency_matrix(gp).todense()
#
#    ew, ev = np.linalg.eig(A)
#    D = np.zeros((len(ew), len(ew)))
#    # use a separate loop variable so the returned index i is not shadowed
#    for k in range(len(ew)):
#        D[k][k] = np.exp(beta * ew[k])
#    exp_D = ev * D * ev.T
#
#    return i, j, exp_D.sum()
#
#
#def _commonwalkkernel_geo(Gn, node_label, edge_label, gamma, ij):
#    """Calculate common walk graph kernels up to n between two graphs using
#    a geometric series.
#    """
#    i = ij[0]
#    j = ij[1]
#    g1 = Gn[i]
#    g2 = Gn[j]
#
#    # get tensor product / direct product
#    gp = direct_product(g1, g2, node_label, edge_label)
#    A = nx.adjacency_matrix(gp).todense()
#    mat = np.identity(len(A)) - gamma * A
#    try:
#        return i, j, mat.I.sum()
#    except np.linalg.LinAlgError:
#        return i, j, np.nan
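
# Note (added): _commonwalkkernel_geo relies on the geometric-series identity
# for the adjacency matrix A of the direct product graph: provided gamma is
# small enough for the series to converge,
#
#     sum_{k >= 0} gamma^k * A^k = (I - gamma * A)^(-1),
#
# and summing all entries of that inverse counts the common walks of every
# length, each weighted by gamma^length; hence mat.I.sum() above.
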
#def structuralspkernel(*args,
#                       node_label='atom',
#                       edge_weight=None,
#                       edge_label='bond_type',
#                       node_kernels=None,
#                       edge_kernels=None,
#                       n_jobs=None,
#                       chunksize=1):
#    """Calculate mean average structural shortest path kernels between graphs.
#    """
#    # pre-process
#    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
#
#    weight = None
#    if edge_weight is None:
#        print('\n No edge weight is specified. Set all weights to 1.\n')
#    else:
#        try:
#            some_weight = list(
#                nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
#            if isinstance(some_weight, (float, int)):
#                weight = edge_weight
#            else:
#                print(
#                    '\n Edge weight with name %s is not float or integer. Set all weights to 1.\n'
#                    % edge_weight)
#        except:
#            print(
#                '\n Edge weight with name "%s" is not found in the edge attributes. Set all weights to 1.\n'
#                % edge_weight)
#    ds_attrs = get_dataset_attributes(
#        Gn,
#        attr_names=['node_labeled', 'node_attr_dim', 'edge_labeled',
#                    'edge_attr_dim', 'is_directed'],
#        node_label=node_label, edge_label=edge_label)
#
#    start_time = time.time()
#
#    # get shortest paths of each graph in Gn
#    splist = [[] for _ in range(len(Gn))]
#    pool = Pool(n_jobs)
#    # get shortest path graphs of Gn
#    getsp_partial = partial(wrap_getSP, Gn, weight, ds_attrs['is_directed'])
##    if len(Gn) < 100:
##        # use the default chunksize of pool.map when the iterable is
##        # shorter than 100
##        chunksize, extra = divmod(len(Gn), n_jobs * 4)
##        if extra:
##            chunksize += 1
##    else:
##        chunksize = 100
#    # chunksize = 300  # int(len(list(itr)) / n_jobs)
#    for i, sp in tqdm(
#            pool.imap_unordered(getsp_partial, range(0, len(Gn)), chunksize),
#            desc='getting shortest paths',
#            file=sys.stdout):
#        splist[i] = sp
#
#    Kmatrix = np.zeros((len(Gn), len(Gn)))
#
#    # ---- use pool.imap_unordered to parallelize and track progress. ----
#    do_partial = partial(structuralspkernel_do, Gn, splist, ds_attrs,
#                         node_label, edge_label, node_kernels, edge_kernels)
#    itr = combinations_with_replacement(range(0, len(Gn)), 2)
##    len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
##    if len_itr < 100:
##        chunksize, extra = divmod(len_itr, n_jobs * 4)
##        if extra:
##            chunksize += 1
##    else:
##        chunksize = 100
#    for i, j, kernel in tqdm(
#            pool.imap_unordered(do_partial, itr, chunksize),
#            desc='calculating kernels',
#            file=sys.stdout):
#        Kmatrix[i][j] = kernel
#        Kmatrix[j][i] = kernel
#    pool.close()
#    pool.join()
#
#    run_time = time.time() - start_time
#    print(
#        "\n --- shortest path kernel matrix of size %d built in %s seconds ---"
#        % (len(Gn), run_time))
#
#    return Kmatrix, run_time
#
#
#def structuralspkernel_do(Gn, splist, ds_attrs, node_label, edge_label,
#                          node_kernels, edge_kernels, ij):
#
#    iglobal = ij[0]
#    jglobal = ij[1]
#    g1 = Gn[iglobal]
#    g2 = Gn[jglobal]
#    spl1 = splist[iglobal]
#    spl2 = splist[jglobal]
#    kernel = 0
#
#    try:
#        # First, compute shortest path matrices, method borrowed from FCSP.
#        if ds_attrs['node_labeled']:
#            # node symb and non-symb labeled
#            if ds_attrs['node_attr_dim'] > 0:
#                kn = node_kernels['mix']
#                vk_dict = {}  # shortest path matrices dict
#                for n1, n2 in product(
#                        g1.nodes(data=True), g2.nodes(data=True)):
#                    vk_dict[(n1[0], n2[0])] = kn(
#                        n1[1][node_label], n2[1][node_label],
#                        [n1[1]['attributes']], [n2[1]['attributes']])
#            # node symb labeled
#            else:
#                kn = node_kernels['symb']
#                vk_dict = {}  # shortest path matrices dict
#                for n1 in g1.nodes(data=True):
#                    for n2 in g2.nodes(data=True):
#                        vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label],
#                                                     n2[1][node_label])
#        else:
#            # node non-symb labeled
#            if ds_attrs['node_attr_dim'] > 0:
#                kn = node_kernels['nsymb']
#                vk_dict = {}  # shortest path matrices dict
#                for n1 in g1.nodes(data=True):
#                    for n2 in g2.nodes(data=True):
#                        vk_dict[(n1[0], n2[0])] = kn([n1[1]['attributes']],
#                                                     [n2[1]['attributes']])
#            # node unlabeled
#            else:
#                vk_dict = {}
#
#        # Then, compute kernels between all pairs of edges; the idea is an
#        # extension of FCSP. It suits sparse graphs, which covers most of
#        # the cases we encountered. For dense graphs it would be slow.
#        if ds_attrs['edge_labeled']:
#            # edge symb and non-symb labeled
#            if ds_attrs['edge_attr_dim'] > 0:
#                ke = edge_kernels['mix']
#                ek_dict = {}  # dict of edge kernels
#                for e1, e2 in product(
#                        g1.edges(data=True), g2.edges(data=True)):
#                    ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ke(
#                        e1[2][edge_label], e2[2][edge_label],
#                        [e1[2]['attributes']], [e2[2]['attributes']])
#            # edge symb labeled
#            else:
#                ke = edge_kernels['symb']
#                ek_dict = {}
#                for e1 in g1.edges(data=True):
#                    for e2 in g2.edges(data=True):
#                        ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ke(
#                            e1[2][edge_label], e2[2][edge_label])
#        else:
#            # edge non-symb labeled
#            if ds_attrs['edge_attr_dim'] > 0:
#                ke = edge_kernels['nsymb']
#                ek_dict = {}
#                for e1 in g1.edges(data=True):
#                    for e2 in g2.edges(data=True):
#                        ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ke(
#                            [e1[2]['attributes']], [e2[2]['attributes']])
#            # edge unlabeled
#            else:
#                ek_dict = {}
#
#        # compute graph kernels
#        if vk_dict:
#            if ek_dict:
#                for p1, p2 in product(spl1, spl2):
#                    if len(p1) == len(p2):
#                        kpath = vk_dict[(p1[0], p2[0])]
#                        if kpath:
#                            for idx in range(1, len(p1)):
#                                kpath *= vk_dict[(p1[idx], p2[idx])] * \
#                                    ek_dict[((p1[idx-1], p1[idx]),
#                                             (p2[idx-1], p2[idx]))]
#                                if not kpath:
#                                    break
#                            kernel += kpath  # add up kernels of all paths
#            else:
#                for p1, p2 in product(spl1, spl2):
#                    if len(p1) == len(p2):
#                        kpath = vk_dict[(p1[0], p2[0])]
#                        if kpath:
#                            for idx in range(1, len(p1)):
#                                kpath *= vk_dict[(p1[idx], p2[idx])]
#                                if not kpath:
#                                    break
#                            kernel += kpath  # add up kernels of all paths
#        else:
#            if ek_dict:
#                for p1, p2 in product(spl1, spl2):
#                    if len(p1) == len(p2):
#                        if len(p1) == 0:
#                            kernel += 1
#                        else:
#                            kpath = 1
#                            for idx in range(0, len(p1) - 1):
#                                kpath *= ek_dict[((p1[idx], p1[idx+1]),
#                                                  (p2[idx], p2[idx+1]))]
#                                if not kpath:
#                                    break
#                            kernel += kpath  # add up kernels of all paths
#            else:
#                for p1, p2 in product(spl1, spl2):
#                    if len(p1) == len(p2):
#                        kernel += 1
#
#        kernel = kernel / (len(spl1) * len(spl2))  # calculate mean average
#    except KeyError:  # missing labels or attributes
#        pass
#
#    return iglobal, jglobal, kernel
#
#
#def get_shortest_paths(G, weight, directed):
#    """Get all shortest paths of a graph.
#    """
#    sp = []
#    for n1, n2 in combinations(G.nodes(), 2):
#        try:
#            sptemp = nx.shortest_path(G, n1, n2, weight=weight)
#            sp.append(sptemp)
#            # each edge walk is counted twice, starting from both its
#            # extreme nodes.
#            if not directed:
#                sp.append(sptemp[::-1])
#        except nx.NetworkXNoPath:  # nodes not connected
#            # sp.append([])
#            pass
#    # add single nodes as length 0 paths.
#    sp += [[n] for n in G.nodes()]
#    return sp
#
#
#def wrap_getSP(Gn, weight, directed, i):
#    return i, get_shortest_paths(Gn[i], weight, directed)

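# A small helper (added sketch; the name compute_chunksize is hypothetical)
# that packages the chunksize heuristic appearing commented out above: give
# each worker about four chunks, as multiprocessing.Pool.map does by default,
# but cap the chunksize at 100 for long iterables.
def compute_chunksize(len_itr, n_jobs):
    if len_itr < 100:
        chunksize, extra = divmod(len_itr, n_jobs * 4)
        if extra:
            chunksize += 1
    else:
        chunksize = 100
    return chunksize
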
def compute_gram_matrices(datafile,
                          estimator,
                          param_grid_precomputed,
                          datafile_y=None,
                          extra_params=None,
                          ds_name='ds-unknown',
                          n_jobs=1,
                          chunksize=1):
    """
    Parameters
    ----------
    datafile : string
        Path of the dataset file.
    estimator : function
        Kernel function used for estimation. This function has to return a
        Gram matrix.
    param_grid_precomputed : dictionary
        Dictionary with names (string) of parameters used to calculate Gram
        matrices as keys and lists of parameter settings to try as values.
        This enables searching over any sequence of parameter settings.
        Params with length 1 will be omitted.
    datafile_y : string
        Path of the file storing y data. This parameter is optional depending
        on the given dataset file.
    """
    tqdm.monitor_interval = 0

    # Load the dataset.
    dataset, y_all = loadDataset(
        datafile, filename_y=datafile_y, extra_params=extra_params)

    # Grid of parameters with a discrete number of values for each.
    param_list_precomputed = list(ParameterGrid(param_grid_precomputed))
    gram_matrix_time = []  # a list to store time to calculate gram matrices

    # calculate all gram matrices
    for idx, params_out in enumerate(param_list_precomputed):
        y = y_all[:]
        params_out['n_jobs'] = n_jobs
        params_out['chunksize'] = chunksize
        rtn_data = estimator(dataset[:], **params_out)
        Kmatrix = rtn_data[0]
        current_run_time = rtn_data[1]
        # for some kernels, some graphs in datasets may not meet the
        # kernels' requirements for graph structure. These graphs are
        # trimmed.
        if len(rtn_data) == 3:
            idx_trim = rtn_data[2]  # the index of trimmed graph list
            y = [y[idx] for idx in idx_trim]  # trim y accordingly

        Kmatrix_diag = Kmatrix.diagonal().copy()
        # remove graphs whose kernels with themselves are zeros
        nb_g_ignore = 0
        for idx, diag in enumerate(Kmatrix_diag):
            if diag == 0:
                Kmatrix = np.delete(Kmatrix, (idx - nb_g_ignore), axis=0)
                Kmatrix = np.delete(Kmatrix, (idx - nb_g_ignore), axis=1)
                nb_g_ignore += 1
        # normalization
        for i in range(len(Kmatrix)):
            for j in range(i, len(Kmatrix)):
                Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
                Kmatrix[j][i] = Kmatrix[i][j]

        gram_matrix_time.append(current_run_time)

    average_gram_matrix_time = np.mean(gram_matrix_time)

    return average_gram_matrix_time

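# A minimal usage sketch of compute_gram_matrices (added; the MAO path below
# matches dslist, the other argument values are illustrative assumptions):
#
#     mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
#     avg_time = compute_gram_matrices(
#         '../../datasets/MAO/dataset.ds',
#         spkernel,
#         {'node_kernels': [{'symb': deltakernel, 'nsymb': gaussiankernel,
#                            'mix': mixkernel}]},
#         ds_name='MAO',
#         n_jobs=4,
#         chunksize=100)
#     print('average time to build the Gram matrix: %f s' % avg_time)
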
dslist = [
    {'name': 'Acyclic', 'dataset': '../../datasets/acyclic/dataset_bps.ds',
     'task': 'regression'},  # node symb
    {'name': 'Alkane', 'dataset': '../../datasets/Alkane/dataset.ds',
     'task': 'regression',
     'dataset_y': '../../datasets/Alkane/dataset_boiling_point_names.txt', },
    # contains single node graph, node symb
    {'name': 'MAO', 'dataset': '../../datasets/MAO/dataset.ds', },
    # node/edge symb
    {'name': 'PAH', 'dataset': '../../datasets/PAH/dataset.ds', },  # unlabeled
    {'name': 'MUTAG', 'dataset': '../../datasets/MUTAG/MUTAG.mat',
     'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}},  # node/edge symb
    {'name': 'Letter-med',
     'dataset': '../../datasets/Letter-med/Letter-med_A.txt'},
    # node symb/nsymb
    {'name': 'ENZYMES',
     'dataset': '../../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
    # node/edge symb
    # {'name': 'Mutagenicity',
    #  'dataset': '../../datasets/Mutagenicity/Mutagenicity_A.txt'},
    # {'name': 'D&D', 'dataset': '../../datasets/D&D/DD.mat',
    #  'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}},  # node symb
]

fig, ax = plt.subplots()
ax.set_xscale('log', nonposx='clip')
ax.set_yscale('log', nonposy='clip')
ax.set_xlabel('parallel chunksize')
ax.set_ylabel('runtime($s$)')
ax.set_title('28 cpus')
ax.grid(axis='both')

estimator = spkernel
if estimator.__name__ == 'spkernel':
    mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
    param_grid_precomputed = {'node_kernels': [
        {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}]}
elif estimator.__name__ == 'commonwalkkernel':
    mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
    param_grid_precomputed = {'compute_method': ['geo'],
                              'weight': [1]}
elif estimator.__name__ == 'structuralspkernel':
    mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
    param_grid_precomputed = {
        'node_kernels':
        [{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}],
        'edge_kernels':
        [{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}]}

#list(range(10, 100, 20)) +
#chunklist = list(range(10, 100, 20)) + list(range(100, 1000, 200)) + \
#    list(range(1000, 10000, 2000)) + list(range(10000, 100000, 20000))
# chunklist = list(range(300, 1000, 200)) + list(range(1000, 10000, 2000)) + \
#     list(range(10000, 100000, 20000))
chunklist = list(range(10, 100, 10)) + list(range(100, 1000, 100)) + \
    list(range(1000, 10000, 1000)) + list(range(10000, 100000, 10000))
#chunklist = list(range(1000, 10000, 1000))
gmtmat = np.zeros((len(dslist), len(chunklist)))
cpus = 28

for idx1, ds in enumerate(dslist):
    print()
    print(ds['name'])

    for idx2, cs in enumerate(chunklist):
        print(ds['name'], idx2, cs)
        gmtmat[idx1][idx2] = compute_gram_matrices(
            ds['dataset'],
            estimator,
            param_grid_precomputed,
            datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None),
            extra_params=(ds['extra_params']
                          if 'extra_params' in ds else None),
            ds_name=ds['name'],
            n_jobs=cpus,
            chunksize=cs)

    print()
    print(gmtmat[idx1, :])
    np.save('../test_parallel/' + estimator.__name__ + '.' + ds['name'] +
            '_' + str(idx1), gmtmat[idx1, :])

    p = ax.plot(chunklist, gmtmat[idx1, :], '.-', label=ds['name'], zorder=3)
    ax.legend(loc='upper right', ncol=3, labelspacing=0.1, handletextpad=0.4,
              columnspacing=0.6)
    plt.savefig('../test_parallel/' + estimator.__name__ + str(idx1) + '_' +
                str(cpus) + '.eps', format='eps', dpi=300)
# plt.show()
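
# A possible follow-up sketch (added; not part of the original script) for
# reloading the timing arrays saved above and reading off the best chunksize
# per dataset, assuming the '../test_parallel/' layout used by np.save:
#
#     for idx1, ds in enumerate(dslist):
#         fname = ('../test_parallel/' + estimator.__name__ + '.' +
#                  ds['name'] + '_' + str(idx1) + '.npy')
#         times = np.load(fname)
#         best = chunklist[int(np.argmin(times))]
#         print('%s: best chunksize = %d' % (ds['name'], best))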

A Python package for graph kernels, graph edit distances and graph pre-image problem.