You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

dataset.py 24 kB

4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. Created on Thu Mar 26 18:48:27 2020
  5. @author: ljia
  6. """
  7. import numpy as np
  8. import networkx as nx
  9. import os
  10. from gklearn.dataset import DATASET_META, DataFetcher, DataLoader
  11. class Dataset(object):
  12. def __init__(self, inputs=None, root='datasets', filename_targets=None, targets=None, mode='networkx', clean_labels=True, reload=False, verbose=False, **kwargs):
  13. if inputs is None:
  14. self._graphs = None
  15. self._targets = None
  16. self._node_labels = None
  17. self._edge_labels = None
  18. self._node_attrs = None
  19. self._edge_attrs = None
  20. # If inputs is a list of graphs.
  21. elif isinstance(inputs, list):
  22. node_labels = kwargs.get('node_labels', None)
  23. node_attrs = kwargs.get('node_attrs', None)
  24. edge_labels = kwargs.get('edge_labels', None)
  25. edge_attrs = kwargs.get('edge_attrs', None)
  26. self.load_graphs(inputs, targets=targets)
  27. self.set_labels(node_labels=node_labels, node_attrs=node_attrs, edge_labels=edge_labels, edge_attrs=edge_attrs)
  28. if clean_labels:
  29. self.clean_labels()
  30. elif isinstance(inputs, str):
  31. # If inputs is predefined dataset name.
  32. if inputs in DATASET_META:
  33. self.load_predefined_dataset(inputs, root=root, clean_labels=clean_labels, reload=reload, verbose=verbose)
  34. # If inputs is a file name.
  35. else:
  36. self.load_dataset(inputs, filename_targets=filename_targets, clean_labels=clean_labels, **kwargs)
  37. else:
  38. raise TypeError('The "inputs" argument cannot be recoganized. "Inputs" can be a list of graphs, a predefined dataset name, or a file name of a dataset.')
  39. self._substructures = None
  40. self._node_label_dim = None
  41. self._edge_label_dim = None
  42. self._directed = None
  43. self._dataset_size = None
  44. self._total_node_num = None
  45. self._ave_node_num = None
  46. self._min_node_num = None
  47. self._max_node_num = None
  48. self._total_edge_num = None
  49. self._ave_edge_num = None
  50. self._min_edge_num = None
  51. self._max_edge_num = None
  52. self._ave_node_degree = None
  53. self._min_node_degree = None
  54. self._max_node_degree = None
  55. self._ave_fill_factor = None
  56. self._min_fill_factor = None
  57. self._max_fill_factor = None
  58. self._node_label_nums = None
  59. self._edge_label_nums = None
  60. self._node_attr_dim = None
  61. self._edge_attr_dim = None
  62. self._class_number = None
  63. def load_dataset(self, filename, filename_targets=None, clean_labels=True, **kwargs):
  64. self._graphs, self._targets, label_names = DataLoader(filename, filename_targets=filename_targets, **kwargs).data
  65. self._node_labels = label_names['node_labels']
  66. self._node_attrs = label_names['node_attrs']
  67. self._edge_labels = label_names['edge_labels']
  68. self._edge_attrs = label_names['edge_attrs']
  69. if clean_labels:
  70. self.clean_labels()
  71. def load_graphs(self, graphs, targets=None):
  72. # this has to be followed by set_labels().
  73. self._graphs = graphs
  74. self._targets = targets
  75. # self.set_labels_attrs() # @todo
  76. def load_predefined_dataset(self, ds_name, root='datasets', clean_labels=True, reload=False, verbose=False):
  77. path = DataFetcher(name=ds_name, root=root, reload=reload, verbose=verbose).path
  78. if DATASET_META[ds_name]['database'] == 'tudataset':
  79. ds_file = os.path.join(path, ds_name + '_A.txt')
  80. fn_targets = None
  81. else:
  82. load_files = DATASET_META[ds_name]['load_files']
  83. ds_file = os.path.join(path, load_files[0])
  84. fn_targets = os.path.join(path, load_files[1]) if len(load_files) == 2 else None
  85. self._graphs, self._targets, label_names = DataLoader(ds_file, filename_targets=fn_targets).data
  86. self._node_labels = label_names['node_labels']
  87. self._node_attrs = label_names['node_attrs']
  88. self._edge_labels = label_names['edge_labels']
  89. self._edge_attrs = label_names['edge_attrs']
  90. if clean_labels:
  91. self.clean_labels()
  92. def set_labels(self, node_labels=[], node_attrs=[], edge_labels=[], edge_attrs=[]):
  93. self._node_labels = node_labels
  94. self._node_attrs = node_attrs
  95. self._edge_labels = edge_labels
  96. self._edge_attrs = edge_attrs
  97. def set_labels_attrs(self, node_labels=None, node_attrs=None, edge_labels=None, edge_attrs=None):
  98. # @todo: remove labels which have only one possible values.
  99. if node_labels is None:
  100. self._node_labels = self._graphs[0].graph['node_labels']
  101. # # graphs are considered node unlabeled if all nodes have the same label.
  102. # infos.update({'node_labeled': is_nl if node_label_num > 1 else False})
  103. if node_attrs is None:
  104. self._node_attrs = self._graphs[0].graph['node_attrs']
  105. # for G in Gn:
  106. # for n in G.nodes(data=True):
  107. # if 'attributes' in n[1]:
  108. # return len(n[1]['attributes'])
  109. # return 0
  110. if edge_labels is None:
  111. self._edge_labels = self._graphs[0].graph['edge_labels']
  112. # # graphs are considered edge unlabeled if all edges have the same label.
  113. # infos.update({'edge_labeled': is_el if edge_label_num > 1 else False})
  114. if edge_attrs is None:
  115. self._edge_attrs = self._graphs[0].graph['edge_attrs']
  116. # for G in Gn:
  117. # if nx.number_of_edges(G) > 0:
  118. # for e in G.edges(data=True):
  119. # if 'attributes' in e[2]:
  120. # return len(e[2]['attributes'])
  121. # return 0
  122. def get_dataset_infos(self, keys=None, params=None):
  123. """Computes and returns the structure and property information of the graph dataset.
  124. Parameters
  125. ----------
  126. keys : list, optional
  127. A list of strings which indicate which informations will be returned. The
  128. possible choices includes:
  129. 'substructures': sub-structures graphs contains, including 'linear', 'non
  130. linear' and 'cyclic'.
  131. 'node_label_dim': whether vertices have symbolic labels.
  132. 'edge_label_dim': whether egdes have symbolic labels.
  133. 'directed': whether graphs in dataset are directed.
  134. 'dataset_size': number of graphs in dataset.
  135. 'total_node_num': total number of vertices of all graphs in dataset.
  136. 'ave_node_num': average number of vertices of graphs in dataset.
  137. 'min_node_num': minimum number of vertices of graphs in dataset.
  138. 'max_node_num': maximum number of vertices of graphs in dataset.
  139. 'total_edge_num': total number of edges of all graphs in dataset.
  140. 'ave_edge_num': average number of edges of graphs in dataset.
  141. 'min_edge_num': minimum number of edges of graphs in dataset.
  142. 'max_edge_num': maximum number of edges of graphs in dataset.
  143. 'ave_node_degree': average vertex degree of graphs in dataset.
  144. 'min_node_degree': minimum vertex degree of graphs in dataset.
  145. 'max_node_degree': maximum vertex degree of graphs in dataset.
  146. 'ave_fill_factor': average fill factor (number_of_edges /
  147. (number_of_nodes ** 2)) of graphs in dataset.
  148. 'min_fill_factor': minimum fill factor of graphs in dataset.
  149. 'max_fill_factor': maximum fill factor of graphs in dataset.
  150. 'node_label_nums': list of numbers of symbolic vertex labels of graphs in dataset.
  151. 'edge_label_nums': list number of symbolic edge labels of graphs in dataset.
  152. 'node_attr_dim': number of dimensions of non-symbolic vertex labels.
  153. Extracted from the 'attributes' attribute of graph nodes.
  154. 'edge_attr_dim': number of dimensions of non-symbolic edge labels.
  155. Extracted from the 'attributes' attribute of graph edges.
  156. 'class_number': number of classes. Only available for classification problems.
  157. 'all_degree_entropy': the entropy of degree distribution of each graph.
  158. 'ave_degree_entropy': the average entropy of degree distribution of all graphs.
  159. All informations above will be returned if `keys` is not given.
  160. params: dict of dict, optional
  161. A dictinary which contains extra parameters for each possible
  162. element in ``keys``.
  163. Return
  164. ------
  165. dict
  166. Information of the graph dataset keyed by `keys`.
  167. """
  168. infos = {}
  169. if keys == None:
  170. keys = [
  171. 'substructures',
  172. 'node_label_dim',
  173. 'edge_label_dim',
  174. 'directed',
  175. 'dataset_size',
  176. 'total_node_num',
  177. 'ave_node_num',
  178. 'min_node_num',
  179. 'max_node_num',
  180. 'total_edge_num',
  181. 'ave_edge_num',
  182. 'min_edge_num',
  183. 'max_edge_num',
  184. 'ave_node_degree',
  185. 'min_node_degree',
  186. 'max_node_degree',
  187. 'ave_fill_factor',
  188. 'min_fill_factor',
  189. 'max_fill_factor',
  190. 'node_label_nums',
  191. 'edge_label_nums',
  192. 'node_attr_dim',
  193. 'edge_attr_dim',
  194. 'class_number',
  195. 'all_degree_entropy',
  196. 'ave_degree_entropy'
  197. ]
  198. # dataset size
  199. if 'dataset_size' in keys:
  200. if self._dataset_size is None:
  201. self._dataset_size = self._get_dataset_size()
  202. infos['dataset_size'] = self._dataset_size
  203. # graph node number
  204. if any(i in keys for i in ['total_node_num', 'ave_node_num', 'min_node_num', 'max_node_num']):
  205. all_node_nums = self._get_all_node_nums()
  206. if 'total_node_num' in keys:
  207. if self._total_node_num is None:
  208. self._total_node_num = self._get_total_node_num(all_node_nums)
  209. infos['total_node_num'] = self._total_node_num
  210. if 'ave_node_num' in keys:
  211. if self._ave_node_num is None:
  212. self._ave_node_num = self._get_ave_node_num(all_node_nums)
  213. infos['ave_node_num'] = self._ave_node_num
  214. if 'min_node_num' in keys:
  215. if self._min_node_num is None:
  216. self._min_node_num = self._get_min_node_num(all_node_nums)
  217. infos['min_node_num'] = self._min_node_num
  218. if 'max_node_num' in keys:
  219. if self._max_node_num is None:
  220. self._max_node_num = self._get_max_node_num(all_node_nums)
  221. infos['max_node_num'] = self._max_node_num
  222. # graph edge number
  223. if any(i in keys for i in ['total_edge_num', 'ave_edge_num', 'min_edge_num', 'max_edge_num']):
  224. all_edge_nums = self._get_all_edge_nums()
  225. if 'total_edge_num' in keys:
  226. if self._total_edge_num is None:
  227. self._total_edge_num = self._get_total_edge_num(all_edge_nums)
  228. infos['total_edge_num'] = self._total_edge_num
  229. if 'ave_edge_num' in keys:
  230. if self._ave_edge_num is None:
  231. self._ave_edge_num = self._get_ave_edge_num(all_edge_nums)
  232. infos['ave_edge_num'] = self._ave_edge_num
  233. if 'max_edge_num' in keys:
  234. if self._max_edge_num is None:
  235. self._max_edge_num = self._get_max_edge_num(all_edge_nums)
  236. infos['max_edge_num'] = self._max_edge_num
  237. if 'min_edge_num' in keys:
  238. if self._min_edge_num is None:
  239. self._min_edge_num = self._get_min_edge_num(all_edge_nums)
  240. infos['min_edge_num'] = self._min_edge_num
  241. # label number
  242. if 'node_label_dim' in keys:
  243. if self._node_label_dim is None:
  244. self._node_label_dim = self._get_node_label_dim()
  245. infos['node_label_dim'] = self._node_label_dim
  246. if 'node_label_nums' in keys:
  247. if self._node_label_nums is None:
  248. self._node_label_nums = {}
  249. for node_label in self._node_labels:
  250. self._node_label_nums[node_label] = self._get_node_label_num(node_label)
  251. infos['node_label_nums'] = self._node_label_nums
  252. if 'edge_label_dim' in keys:
  253. if self._edge_label_dim is None:
  254. self._edge_label_dim = self._get_edge_label_dim()
  255. infos['edge_label_dim'] = self._edge_label_dim
  256. if 'edge_label_nums' in keys:
  257. if self._edge_label_nums is None:
  258. self._edge_label_nums = {}
  259. for edge_label in self._edge_labels:
  260. self._edge_label_nums[edge_label] = self._get_edge_label_num(edge_label)
  261. infos['edge_label_nums'] = self._edge_label_nums
  262. if 'directed' in keys or 'substructures' in keys:
  263. if self._directed is None:
  264. self._directed = self._is_directed()
  265. infos['directed'] = self._directed
  266. # node degree
  267. if any(i in keys for i in ['ave_node_degree', 'max_node_degree', 'min_node_degree']):
  268. all_node_degrees = self._get_all_node_degrees()
  269. if 'ave_node_degree' in keys:
  270. if self._ave_node_degree is None:
  271. self._ave_node_degree = self._get_ave_node_degree(all_node_degrees)
  272. infos['ave_node_degree'] = self._ave_node_degree
  273. if 'max_node_degree' in keys:
  274. if self._max_node_degree is None:
  275. self._max_node_degree = self._get_max_node_degree(all_node_degrees)
  276. infos['max_node_degree'] = self._max_node_degree
  277. if 'min_node_degree' in keys:
  278. if self._min_node_degree is None:
  279. self._min_node_degree = self._get_min_node_degree(all_node_degrees)
  280. infos['min_node_degree'] = self._min_node_degree
  281. # fill factor
  282. if any(i in keys for i in ['ave_fill_factor', 'max_fill_factor', 'min_fill_factor']):
  283. all_fill_factors = self._get_all_fill_factors()
  284. if 'ave_fill_factor' in keys:
  285. if self._ave_fill_factor is None:
  286. self._ave_fill_factor = self._get_ave_fill_factor(all_fill_factors)
  287. infos['ave_fill_factor'] = self._ave_fill_factor
  288. if 'max_fill_factor' in keys:
  289. if self._max_fill_factor is None:
  290. self._max_fill_factor = self._get_max_fill_factor(all_fill_factors)
  291. infos['max_fill_factor'] = self._max_fill_factor
  292. if 'min_fill_factor' in keys:
  293. if self._min_fill_factor is None:
  294. self._min_fill_factor = self._get_min_fill_factor(all_fill_factors)
  295. infos['min_fill_factor'] = self._min_fill_factor
  296. if 'substructures' in keys:
  297. if self._substructures is None:
  298. self._substructures = self._get_substructures()
  299. infos['substructures'] = self._substructures
  300. if 'class_number' in keys:
  301. if self._class_number is None:
  302. self._class_number = self._get_class_number()
  303. infos['class_number'] = self._class_number
  304. if 'node_attr_dim' in keys:
  305. if self._node_attr_dim is None:
  306. self._node_attr_dim = self._get_node_attr_dim()
  307. infos['node_attr_dim'] = self._node_attr_dim
  308. if 'edge_attr_dim' in keys:
  309. if self._edge_attr_dim is None:
  310. self._edge_attr_dim = self._get_edge_attr_dim()
  311. infos['edge_attr_dim'] = self._edge_attr_dim
  312. # entropy of degree distribution.
  313. if 'all_degree_entropy' in keys:
  314. if params is not None and ('all_degree_entropy' in params) and ('base' in params['all_degree_entropy']):
  315. base = params['all_degree_entropy']['base']
  316. else:
  317. base = None
  318. infos['all_degree_entropy'] = self._compute_all_degree_entropy(base=base)
  319. if 'ave_degree_entropy' in keys:
  320. if params is not None and ('ave_degree_entropy' in params) and ('base' in params['ave_degree_entropy']):
  321. base = params['ave_degree_entropy']['base']
  322. else:
  323. base = None
  324. infos['ave_degree_entropy'] = np.mean(self._compute_all_degree_entropy(base=base))
  325. return infos
  326. def print_graph_infos(self, infos):
  327. from collections import OrderedDict
  328. keys = list(infos.keys())
  329. print(OrderedDict(sorted(infos.items(), key=lambda i: keys.index(i[0]))))
  330. def remove_labels(self, node_labels=[], edge_labels=[], node_attrs=[], edge_attrs=[]):
  331. node_labels = [item for item in node_labels if item in self._node_labels]
  332. edge_labels = [item for item in edge_labels if item in self._edge_labels]
  333. node_attrs = [item for item in node_attrs if item in self._node_attrs]
  334. edge_attrs = [item for item in edge_attrs if item in self._edge_attrs]
  335. for g in self._graphs:
  336. for nd in g.nodes():
  337. for nl in node_labels:
  338. del g.nodes[nd][nl]
  339. for na in node_attrs:
  340. del g.nodes[nd][na]
  341. for ed in g.edges():
  342. for el in edge_labels:
  343. del g.edges[ed][el]
  344. for ea in edge_attrs:
  345. del g.edges[ed][ea]
  346. if len(node_labels) > 0:
  347. self._node_labels = [nl for nl in self._node_labels if nl not in node_labels]
  348. if len(edge_labels) > 0:
  349. self._edge_labels = [el for el in self._edge_labels if el not in edge_labels]
  350. if len(node_attrs) > 0:
  351. self._node_attrs = [na for na in self._node_attrs if na not in node_attrs]
  352. if len(edge_attrs) > 0:
  353. self._edge_attrs = [ea for ea in self._edge_attrs if ea not in edge_attrs]
  354. def clean_labels(self):
  355. labels = []
  356. for name in self._node_labels:
  357. label = set()
  358. for G in self._graphs:
  359. label = label | set(nx.get_node_attributes(G, name).values())
  360. if len(label) > 1:
  361. labels.append(name)
  362. break
  363. if len(label) < 2:
  364. for G in self._graphs:
  365. for nd in G.nodes():
  366. del G.nodes[nd][name]
  367. self._node_labels = labels
  368. labels = []
  369. for name in self._edge_labels:
  370. label = set()
  371. for G in self._graphs:
  372. label = label | set(nx.get_edge_attributes(G, name).values())
  373. if len(label) > 1:
  374. labels.append(name)
  375. break
  376. if len(label) < 2:
  377. for G in self._graphs:
  378. for ed in G.edges():
  379. del G.edges[ed][name]
  380. self._edge_labels = labels
  381. labels = []
  382. for name in self._node_attrs:
  383. label = set()
  384. for G in self._graphs:
  385. label = label | set(nx.get_node_attributes(G, name).values())
  386. if len(label) > 1:
  387. labels.append(name)
  388. break
  389. if len(label) < 2:
  390. for G in self._graphs:
  391. for nd in G.nodes():
  392. del G.nodes[nd][name]
  393. self._node_attrs = labels
  394. labels = []
  395. for name in self._edge_attrs:
  396. label = set()
  397. for G in self._graphs:
  398. label = label | set(nx.get_edge_attributes(G, name).values())
  399. if len(label) > 1:
  400. labels.append(name)
  401. break
  402. if len(label) < 2:
  403. for G in self._graphs:
  404. for ed in G.edges():
  405. del G.edges[ed][name]
  406. self._edge_attrs = labels
  407. def cut_graphs(self, range_):
  408. self._graphs = [self._graphs[i] for i in range_]
  409. if self._targets is not None:
  410. self._targets = [self._targets[i] for i in range_]
  411. self.clean_labels()
  412. def trim_dataset(self, edge_required=False):
  413. if edge_required:
  414. trimed_pairs = [(idx, g) for idx, g in enumerate(self._graphs) if (nx.number_of_nodes(g) != 0 and nx.number_of_edges(g) != 0)]
  415. else:
  416. trimed_pairs = [(idx, g) for idx, g in enumerate(self._graphs) if nx.number_of_nodes(g) != 0]
  417. idx = [p[0] for p in trimed_pairs]
  418. self._graphs = [p[1] for p in trimed_pairs]
  419. self._targets = [self._targets[i] for i in idx]
  420. self.clean_labels()
  421. def copy(self):
  422. dataset = Dataset()
  423. graphs = [g.copy() for g in self._graphs] if self._graphs is not None else None
  424. target = self._targets.copy() if self._targets is not None else None
  425. node_labels = self._node_labels.copy() if self._node_labels is not None else None
  426. node_attrs = self._node_attrs.copy() if self._node_attrs is not None else None
  427. edge_labels = self._edge_labels.copy() if self._edge_labels is not None else None
  428. edge_attrs = self._edge_attrs.copy() if self._edge_attrs is not None else None
  429. dataset.load_graphs(graphs, target)
  430. dataset.set_labels(node_labels=node_labels, node_attrs=node_attrs, edge_labels=edge_labels, edge_attrs=edge_attrs)
  431. # @todo: clean_labels and add other class members?
  432. return dataset
  433. def get_all_node_labels(self):
  434. node_labels = []
  435. for g in self._graphs:
  436. for n in g.nodes():
  437. nl = tuple(g.nodes[n].items())
  438. if nl not in node_labels:
  439. node_labels.append(nl)
  440. return node_labels
  441. def get_all_edge_labels(self):
  442. edge_labels = []
  443. for g in self._graphs:
  444. for e in g.edges():
  445. el = tuple(g.edges[e].items())
  446. if el not in edge_labels:
  447. edge_labels.append(el)
  448. return edge_labels
  449. def _get_dataset_size(self):
  450. return len(self._graphs)
  451. def _get_all_node_nums(self):
  452. return [nx.number_of_nodes(G) for G in self._graphs]
  453. def _get_total_node_nums(self, all_node_nums):
  454. return np.sum(all_node_nums)
  455. def _get_ave_node_num(self, all_node_nums):
  456. return np.mean(all_node_nums)
  457. def _get_min_node_num(self, all_node_nums):
  458. return np.amin(all_node_nums)
  459. def _get_max_node_num(self, all_node_nums):
  460. return np.amax(all_node_nums)
  461. def _get_all_edge_nums(self):
  462. return [nx.number_of_edges(G) for G in self._graphs]
  463. def _get_total_edge_nums(self, all_edge_nums):
  464. return np.sum(all_edge_nums)
  465. def _get_ave_edge_num(self, all_edge_nums):
  466. return np.mean(all_edge_nums)
  467. def _get_min_edge_num(self, all_edge_nums):
  468. return np.amin(all_edge_nums)
  469. def _get_max_edge_num(self, all_edge_nums):
  470. return np.amax(all_edge_nums)
  471. def _get_node_label_dim(self):
  472. return len(self._node_labels)
  473. def _get_node_label_num(self, node_label):
  474. nl = set()
  475. for G in self._graphs:
  476. nl = nl | set(nx.get_node_attributes(G, node_label).values())
  477. return len(nl)
  478. def _get_edge_label_dim(self):
  479. return len(self._edge_labels)
  480. def _get_edge_label_num(self, edge_label):
  481. el = set()
  482. for G in self._graphs:
  483. el = el | set(nx.get_edge_attributes(G, edge_label).values())
  484. return len(el)
  485. def _is_directed(self):
  486. return nx.is_directed(self._graphs[0])
  487. def _get_all_node_degrees(self):
  488. return [np.mean(list(dict(G.degree()).values())) for G in self._graphs]
  489. def _get_ave_node_degree(self, all_node_degrees):
  490. return np.mean(all_node_degrees)
  491. def _get_max_node_degree(self, all_node_degrees):
  492. return np.amax(all_node_degrees)
  493. def _get_min_node_degree(self, all_node_degrees):
  494. return np.amin(all_node_degrees)
  495. def _get_all_fill_factors(self):
  496. """Get fill factor, the number of non-zero entries in the adjacency matrix.
  497. Returns
  498. -------
  499. list[float]
  500. List of fill factors for all graphs.
  501. """
  502. return [nx.number_of_edges(G) / (nx.number_of_nodes(G) ** 2) for G in self._graphs]
  503. def _get_ave_fill_factor(self, all_fill_factors):
  504. return np.mean(all_fill_factors)
  505. def _get_max_fill_factor(self, all_fill_factors):
  506. return np.amax(all_fill_factors)
  507. def _get_min_fill_factor(self, all_fill_factors):
  508. return np.amin(all_fill_factors)
  509. def _get_substructures(self):
  510. subs = set()
  511. for G in self._graphs:
  512. degrees = list(dict(G.degree()).values())
  513. if any(i == 2 for i in degrees):
  514. subs.add('linear')
  515. if np.amax(degrees) >= 3:
  516. subs.add('non linear')
  517. if 'linear' in subs and 'non linear' in subs:
  518. break
  519. if self._directed:
  520. for G in self._graphs:
  521. if len(list(nx.find_cycle(G))) > 0:
  522. subs.add('cyclic')
  523. break
  524. # else:
  525. # # @todo: this method does not work for big graph with large amount of edges like D&D, try a better way.
  526. # upper = np.amin([nx.number_of_edges(G) for G in Gn]) * 2 + 10
  527. # for G in Gn:
  528. # if (nx.number_of_edges(G) < upper):
  529. # cyc = list(nx.simple_cycles(G.to_directed()))
  530. # if any(len(i) > 2 for i in cyc):
  531. # subs.add('cyclic')
  532. # break
  533. # if 'cyclic' not in subs:
  534. # for G in Gn:
  535. # cyc = list(nx.simple_cycles(G.to_directed()))
  536. # if any(len(i) > 2 for i in cyc):
  537. # subs.add('cyclic')
  538. # break
  539. return subs
  540. def _get_class_num(self):
  541. return len(set(self._targets))
  542. def _get_node_attr_dim(self):
  543. return len(self._node_attrs)
  544. def _get_edge_attr_dim(self):
  545. return len(self._edge_attrs)
  546. def _compute_all_degree_entropy(self, base=None):
  547. """Compute the entropy of degree distribution of each graph.
  548. Parameters
  549. ----------
  550. base : float, optional
  551. The logarithmic base to use. The default is ``e`` (natural logarithm).
  552. Returns
  553. -------
  554. degree_entropy : float
  555. The calculated entropy.
  556. """
  557. from gklearn.utils.stats import entropy
  558. degree_entropy = []
  559. for g in self._graphs:
  560. degrees = list(dict(g.degree()).values())
  561. en = entropy(degrees, base=base)
  562. degree_entropy.append(en)
  563. return degree_entropy
  564. @property
  565. def graphs(self):
  566. return self._graphs
  567. @property
  568. def targets(self):
  569. return self._targets
  570. @property
  571. def node_labels(self):
  572. return self._node_labels
  573. @property
  574. def edge_labels(self):
  575. return self._edge_labels
  576. @property
  577. def node_attrs(self):
  578. return self._node_attrs
  579. @property
  580. def edge_attrs(self):
  581. return self._edge_attrs
  582. def split_dataset_by_target(dataset):
  583. from gklearn.preimage.utils import get_same_item_indices
  584. graphs = dataset.graphs
  585. targets = dataset.targets
  586. datasets = []
  587. idx_targets = get_same_item_indices(targets)
  588. for key, val in idx_targets.items():
  589. sub_graphs = [graphs[i] for i in val]
  590. sub_dataset = Dataset()
  591. sub_dataset.load_graphs(sub_graphs, [key] * len(val))
  592. node_labels = dataset.node_labels.copy() if dataset.node_labels is not None else None
  593. node_attrs = dataset.node_attrs.copy() if dataset.node_attrs is not None else None
  594. edge_labels = dataset.edge_labels.copy() if dataset.edge_labels is not None else None
  595. edge_attrs = dataset.edge_attrs.copy() if dataset.edge_attrs is not None else None
  596. sub_dataset.set_labels(node_labels=node_labels, node_attrs=node_attrs, edge_labels=edge_labels, edge_attrs=edge_attrs)
  597. datasets.append(sub_dataset)
  598. # @todo: clean_labels?
  599. return datasets

A Python package for graph kernels, graph edit distances, and the graph pre-image problem.