You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

dataset.py 27 kB

4 years ago
5 years ago
5 years ago
5 years ago
5 years ago
4 years ago
5 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. Created on Thu Mar 26 18:48:27 2020
  5. @author: ljia
  6. """
  7. import numpy as np
  8. import networkx as nx
  9. from gklearn.utils.graph_files import load_dataset
  10. import os
  11. class Dataset(object):
  12. import warnings
  13. warnings.simplefilter('always', DeprecationWarning)
  14. warnings.warn('This class has been moved to "gklearn.dataset" module. The class "gklearn.utils.dataset.Dataset" has not been maintained since Nov 12th, 2020 (version 0.2.1) and will be removed since version 0.4.0.', DeprecationWarning)
  15. def __init__(self, filename=None, filename_targets=None, **kwargs):
  16. if filename is None:
  17. self._graphs = None
  18. self._targets = None
  19. self._node_labels = None
  20. self._edge_labels = None
  21. self._node_attrs = None
  22. self._edge_attrs = None
  23. else:
  24. self.load_dataset(filename, filename_targets=filename_targets, **kwargs)
  25. self._substructures = None
  26. self._node_label_dim = None
  27. self._edge_label_dim = None
  28. self._directed = None
  29. self._dataset_size = None
  30. self._total_node_num = None
  31. self._ave_node_num = None
  32. self._min_node_num = None
  33. self._max_node_num = None
  34. self._total_edge_num = None
  35. self._ave_edge_num = None
  36. self._min_edge_num = None
  37. self._max_edge_num = None
  38. self._ave_node_degree = None
  39. self._min_node_degree = None
  40. self._max_node_degree = None
  41. self._ave_fill_factor = None
  42. self._min_fill_factor = None
  43. self._max_fill_factor = None
  44. self._node_label_nums = None
  45. self._edge_label_nums = None
  46. self._node_attr_dim = None
  47. self._edge_attr_dim = None
  48. self._class_number = None
  49. def load_dataset(self, filename, filename_targets=None, **kwargs):
  50. self._graphs, self._targets, label_names = load_dataset(filename, filename_targets=filename_targets, **kwargs)
  51. self._node_labels = label_names['node_labels']
  52. self._node_attrs = label_names['node_attrs']
  53. self._edge_labels = label_names['edge_labels']
  54. self._edge_attrs = label_names['edge_attrs']
  55. self.clean_labels()
  56. def load_graphs(self, graphs, targets=None):
  57. # this has to be followed by set_labels().
  58. self._graphs = graphs
  59. self._targets = targets
  60. # self.set_labels_attrs() # @todo
  61. def load_predefined_dataset(self, ds_name):
  62. current_path = os.path.dirname(os.path.realpath(__file__)) + '/'
  63. if ds_name == 'Acyclic':
  64. ds_file = current_path + '../../datasets/Acyclic/dataset_bps.ds'
  65. self._graphs, self._targets, label_names = load_dataset(ds_file)
  66. elif ds_name == 'AIDS':
  67. ds_file = current_path + '../../datasets/AIDS/AIDS_A.txt'
  68. self._graphs, self._targets, label_names = load_dataset(ds_file)
  69. elif ds_name == 'Alkane':
  70. ds_file = current_path + '../../datasets/Alkane/dataset.ds'
  71. fn_targets = current_path + '../../datasets/Alkane/dataset_boiling_point_names.txt'
  72. self._graphs, self._targets, label_names = load_dataset(ds_file, filename_targets=fn_targets)
  73. elif ds_name == 'COIL-DEL':
  74. ds_file = current_path + '../../datasets/COIL-DEL/COIL-DEL_A.txt'
  75. self._graphs, self._targets, label_names = load_dataset(ds_file)
  76. elif ds_name == 'COIL-RAG':
  77. ds_file = current_path + '../../datasets/COIL-RAG/COIL-RAG_A.txt'
  78. self._graphs, self._targets, label_names = load_dataset(ds_file)
  79. elif ds_name == 'COLORS-3':
  80. ds_file = current_path + '../../datasets/COLORS-3/COLORS-3_A.txt'
  81. self._graphs, self._targets, label_names = load_dataset(ds_file)
  82. elif ds_name == 'Cuneiform':
  83. ds_file = current_path + '../../datasets/Cuneiform/Cuneiform_A.txt'
  84. self._graphs, self._targets, label_names = load_dataset(ds_file)
  85. elif ds_name == 'DD':
  86. ds_file = current_path + '../../datasets/DD/DD_A.txt'
  87. self._graphs, self._targets, label_names = load_dataset(ds_file)
  88. elif ds_name == 'ENZYMES':
  89. ds_file = current_path + '../../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'
  90. self._graphs, self._targets, label_names = load_dataset(ds_file)
  91. elif ds_name == 'Fingerprint':
  92. ds_file = current_path + '../../datasets/Fingerprint/Fingerprint_A.txt'
  93. self._graphs, self._targets, label_names = load_dataset(ds_file)
  94. elif ds_name == 'FRANKENSTEIN':
  95. ds_file = current_path + '../../datasets/FRANKENSTEIN/FRANKENSTEIN_A.txt'
  96. self._graphs, self._targets, label_names = load_dataset(ds_file)
  97. elif ds_name == 'Letter-high': # node non-symb
  98. ds_file = current_path + '../../datasets/Letter-high/Letter-high_A.txt'
  99. self._graphs, self._targets, label_names = load_dataset(ds_file)
  100. elif ds_name == 'Letter-low': # node non-symb
  101. ds_file = current_path + '../../datasets/Letter-low/Letter-low_A.txt'
  102. self._graphs, self._targets, label_names = load_dataset(ds_file)
  103. elif ds_name == 'Letter-med': # node non-symb
  104. ds_file = current_path + '../../datasets/Letter-med/Letter-med_A.txt'
  105. self._graphs, self._targets, label_names = load_dataset(ds_file)
  106. elif ds_name == 'MAO':
  107. ds_file = current_path + '../../datasets/MAO/dataset.ds'
  108. self._graphs, self._targets, label_names = load_dataset(ds_file)
  109. elif ds_name == 'Monoterpenoides':
  110. ds_file = current_path + '../../datasets/Monoterpenoides/dataset_10+.ds'
  111. self._graphs, self._targets, label_names = load_dataset(ds_file)
  112. elif ds_name == 'MUTAG':
  113. ds_file = current_path + '../../datasets/MUTAG/MUTAG_A.txt'
  114. self._graphs, self._targets, label_names = load_dataset(ds_file)
  115. elif ds_name == 'NCI1':
  116. ds_file = current_path + '../../datasets/NCI1/NCI1_A.txt'
  117. self._graphs, self._targets, label_names = load_dataset(ds_file)
  118. elif ds_name == 'NCI109':
  119. ds_file = current_path + '../../datasets/NCI109/NCI109_A.txt'
  120. self._graphs, self._targets, label_names = load_dataset(ds_file)
  121. elif ds_name == 'PAH':
  122. ds_file = current_path + '../../datasets/PAH/dataset.ds'
  123. self._graphs, self._targets, label_names = load_dataset(ds_file)
  124. elif ds_name == 'SYNTHETIC':
  125. pass
  126. elif ds_name == 'SYNTHETICnew':
  127. ds_file = current_path + '../../datasets/SYNTHETICnew/SYNTHETICnew_A.txt'
  128. self._graphs, self._targets, label_names = load_dataset(ds_file)
  129. elif ds_name == 'Synthie':
  130. pass
  131. else:
  132. raise Exception('The dataset name "', ds_name, '" is not pre-defined.')
  133. self._node_labels = label_names['node_labels']
  134. self._node_attrs = label_names['node_attrs']
  135. self._edge_labels = label_names['edge_labels']
  136. self._edge_attrs = label_names['edge_attrs']
  137. self.clean_labels()
  138. def set_labels(self, node_labels=[], node_attrs=[], edge_labels=[], edge_attrs=[]):
  139. self._node_labels = node_labels
  140. self._node_attrs = node_attrs
  141. self._edge_labels = edge_labels
  142. self._edge_attrs = edge_attrs
  143. def set_labels_attrs(self, node_labels=None, node_attrs=None, edge_labels=None, edge_attrs=None):
  144. # @todo: remove labels which have only one possible values.
  145. if node_labels is None:
  146. self._node_labels = self._graphs[0].graph['node_labels']
  147. # # graphs are considered node unlabeled if all nodes have the same label.
  148. # infos.update({'node_labeled': is_nl if node_label_num > 1 else False})
  149. if node_attrs is None:
  150. self._node_attrs = self._graphs[0].graph['node_attrs']
  151. # for G in Gn:
  152. # for n in G.nodes(data=True):
  153. # if 'attributes' in n[1]:
  154. # return len(n[1]['attributes'])
  155. # return 0
  156. if edge_labels is None:
  157. self._edge_labels = self._graphs[0].graph['edge_labels']
  158. # # graphs are considered edge unlabeled if all edges have the same label.
  159. # infos.update({'edge_labeled': is_el if edge_label_num > 1 else False})
  160. if edge_attrs is None:
  161. self._edge_attrs = self._graphs[0].graph['edge_attrs']
  162. # for G in Gn:
  163. # if nx.number_of_edges(G) > 0:
  164. # for e in G.edges(data=True):
  165. # if 'attributes' in e[2]:
  166. # return len(e[2]['attributes'])
  167. # return 0
  168. def get_dataset_infos(self, keys=None, params=None):
  169. """Computes and returns the structure and property information of the graph dataset.
  170. Parameters
  171. ----------
  172. keys : list, optional
  173. A list of strings which indicate which informations will be returned. The
  174. possible choices includes:
  175. 'substructures': sub-structures graphs contains, including 'linear', 'non
  176. linear' and 'cyclic'.
  177. 'node_label_dim': whether vertices have symbolic labels.
  178. 'edge_label_dim': whether egdes have symbolic labels.
  179. 'directed': whether graphs in dataset are directed.
  180. 'dataset_size': number of graphs in dataset.
  181. 'total_node_num': total number of vertices of all graphs in dataset.
  182. 'ave_node_num': average number of vertices of graphs in dataset.
  183. 'min_node_num': minimum number of vertices of graphs in dataset.
  184. 'max_node_num': maximum number of vertices of graphs in dataset.
  185. 'total_edge_num': total number of edges of all graphs in dataset.
  186. 'ave_edge_num': average number of edges of graphs in dataset.
  187. 'min_edge_num': minimum number of edges of graphs in dataset.
  188. 'max_edge_num': maximum number of edges of graphs in dataset.
  189. 'ave_node_degree': average vertex degree of graphs in dataset.
  190. 'min_node_degree': minimum vertex degree of graphs in dataset.
  191. 'max_node_degree': maximum vertex degree of graphs in dataset.
  192. 'ave_fill_factor': average fill factor (number_of_edges /
  193. (number_of_nodes ** 2)) of graphs in dataset.
  194. 'min_fill_factor': minimum fill factor of graphs in dataset.
  195. 'max_fill_factor': maximum fill factor of graphs in dataset.
  196. 'node_label_nums': list of numbers of symbolic vertex labels of graphs in dataset.
  197. 'edge_label_nums': list number of symbolic edge labels of graphs in dataset.
  198. 'node_attr_dim': number of dimensions of non-symbolic vertex labels.
  199. Extracted from the 'attributes' attribute of graph nodes.
  200. 'edge_attr_dim': number of dimensions of non-symbolic edge labels.
  201. Extracted from the 'attributes' attribute of graph edges.
  202. 'class_number': number of classes. Only available for classification problems.
  203. 'all_degree_entropy': the entropy of degree distribution of each graph.
  204. 'ave_degree_entropy': the average entropy of degree distribution of all graphs.
  205. All informations above will be returned if `keys` is not given.
  206. params: dict of dict, optional
  207. A dictinary which contains extra parameters for each possible
  208. element in ``keys``.
  209. Return
  210. ------
  211. dict
  212. Information of the graph dataset keyed by `keys`.
  213. """
  214. infos = {}
  215. if keys == None:
  216. keys = [
  217. 'substructures',
  218. 'node_label_dim',
  219. 'edge_label_dim',
  220. 'directed',
  221. 'dataset_size',
  222. 'total_node_num',
  223. 'ave_node_num',
  224. 'min_node_num',
  225. 'max_node_num',
  226. 'total_edge_num',
  227. 'ave_edge_num',
  228. 'min_edge_num',
  229. 'max_edge_num',
  230. 'ave_node_degree',
  231. 'min_node_degree',
  232. 'max_node_degree',
  233. 'ave_fill_factor',
  234. 'min_fill_factor',
  235. 'max_fill_factor',
  236. 'node_label_nums',
  237. 'edge_label_nums',
  238. 'node_attr_dim',
  239. 'edge_attr_dim',
  240. 'class_number',
  241. 'all_degree_entropy',
  242. 'ave_degree_entropy'
  243. ]
  244. # dataset size
  245. if 'dataset_size' in keys:
  246. if self._dataset_size is None:
  247. self._dataset_size = self._get_dataset_size()
  248. infos['dataset_size'] = self._dataset_size
  249. # graph node number
  250. if any(i in keys for i in ['total_node_num', 'ave_node_num', 'min_node_num', 'max_node_num']):
  251. all_node_nums = self._get_all_node_nums()
  252. if 'total_node_num' in keys:
  253. if self._total_node_num is None:
  254. self._total_node_num = self._get_total_node_num(all_node_nums)
  255. infos['total_node_num'] = self._total_node_num
  256. if 'ave_node_num' in keys:
  257. if self._ave_node_num is None:
  258. self._ave_node_num = self._get_ave_node_num(all_node_nums)
  259. infos['ave_node_num'] = self._ave_node_num
  260. if 'min_node_num' in keys:
  261. if self._min_node_num is None:
  262. self._min_node_num = self._get_min_node_num(all_node_nums)
  263. infos['min_node_num'] = self._min_node_num
  264. if 'max_node_num' in keys:
  265. if self._max_node_num is None:
  266. self._max_node_num = self._get_max_node_num(all_node_nums)
  267. infos['max_node_num'] = self._max_node_num
  268. # graph edge number
  269. if any(i in keys for i in ['total_edge_num', 'ave_edge_num', 'min_edge_num', 'max_edge_num']):
  270. all_edge_nums = self._get_all_edge_nums()
  271. if 'total_edge_num' in keys:
  272. if self._total_edge_num is None:
  273. self._total_edge_num = self._get_total_edge_num(all_edge_nums)
  274. infos['total_edge_num'] = self._total_edge_num
  275. if 'ave_edge_num' in keys:
  276. if self._ave_edge_num is None:
  277. self._ave_edge_num = self._get_ave_edge_num(all_edge_nums)
  278. infos['ave_edge_num'] = self._ave_edge_num
  279. if 'max_edge_num' in keys:
  280. if self._max_edge_num is None:
  281. self._max_edge_num = self._get_max_edge_num(all_edge_nums)
  282. infos['max_edge_num'] = self._max_edge_num
  283. if 'min_edge_num' in keys:
  284. if self._min_edge_num is None:
  285. self._min_edge_num = self._get_min_edge_num(all_edge_nums)
  286. infos['min_edge_num'] = self._min_edge_num
  287. # label number
  288. if 'node_label_dim' in keys:
  289. if self._node_label_dim is None:
  290. self._node_label_dim = self._get_node_label_dim()
  291. infos['node_label_dim'] = self._node_label_dim
  292. if 'node_label_nums' in keys:
  293. if self._node_label_nums is None:
  294. self._node_label_nums = {}
  295. for node_label in self._node_labels:
  296. self._node_label_nums[node_label] = self._get_node_label_num(node_label)
  297. infos['node_label_nums'] = self._node_label_nums
  298. if 'edge_label_dim' in keys:
  299. if self._edge_label_dim is None:
  300. self._edge_label_dim = self._get_edge_label_dim()
  301. infos['edge_label_dim'] = self._edge_label_dim
  302. if 'edge_label_nums' in keys:
  303. if self._edge_label_nums is None:
  304. self._edge_label_nums = {}
  305. for edge_label in self._edge_labels:
  306. self._edge_label_nums[edge_label] = self._get_edge_label_num(edge_label)
  307. infos['edge_label_nums'] = self._edge_label_nums
  308. if 'directed' in keys or 'substructures' in keys:
  309. if self._directed is None:
  310. self._directed = self._is_directed()
  311. infos['directed'] = self._directed
  312. # node degree
  313. if any(i in keys for i in ['ave_node_degree', 'max_node_degree', 'min_node_degree']):
  314. all_node_degrees = self._get_all_node_degrees()
  315. if 'ave_node_degree' in keys:
  316. if self._ave_node_degree is None:
  317. self._ave_node_degree = self._get_ave_node_degree(all_node_degrees)
  318. infos['ave_node_degree'] = self._ave_node_degree
  319. if 'max_node_degree' in keys:
  320. if self._max_node_degree is None:
  321. self._max_node_degree = self._get_max_node_degree(all_node_degrees)
  322. infos['max_node_degree'] = self._max_node_degree
  323. if 'min_node_degree' in keys:
  324. if self._min_node_degree is None:
  325. self._min_node_degree = self._get_min_node_degree(all_node_degrees)
  326. infos['min_node_degree'] = self._min_node_degree
  327. # fill factor
  328. if any(i in keys for i in ['ave_fill_factor', 'max_fill_factor', 'min_fill_factor']):
  329. all_fill_factors = self._get_all_fill_factors()
  330. if 'ave_fill_factor' in keys:
  331. if self._ave_fill_factor is None:
  332. self._ave_fill_factor = self._get_ave_fill_factor(all_fill_factors)
  333. infos['ave_fill_factor'] = self._ave_fill_factor
  334. if 'max_fill_factor' in keys:
  335. if self._max_fill_factor is None:
  336. self._max_fill_factor = self._get_max_fill_factor(all_fill_factors)
  337. infos['max_fill_factor'] = self._max_fill_factor
  338. if 'min_fill_factor' in keys:
  339. if self._min_fill_factor is None:
  340. self._min_fill_factor = self._get_min_fill_factor(all_fill_factors)
  341. infos['min_fill_factor'] = self._min_fill_factor
  342. if 'substructures' in keys:
  343. if self._substructures is None:
  344. self._substructures = self._get_substructures()
  345. infos['substructures'] = self._substructures
  346. if 'class_number' in keys:
  347. if self._class_number is None:
  348. self._class_number = self._get_class_number()
  349. infos['class_number'] = self._class_number
  350. if 'node_attr_dim' in keys:
  351. if self._node_attr_dim is None:
  352. self._node_attr_dim = self._get_node_attr_dim()
  353. infos['node_attr_dim'] = self._node_attr_dim
  354. if 'edge_attr_dim' in keys:
  355. if self._edge_attr_dim is None:
  356. self._edge_attr_dim = self._get_edge_attr_dim()
  357. infos['edge_attr_dim'] = self._edge_attr_dim
  358. # entropy of degree distribution.
  359. if 'all_degree_entropy' in keys:
  360. if params is not None and ('all_degree_entropy' in params) and ('base' in params['all_degree_entropy']):
  361. base = params['all_degree_entropy']['base']
  362. else:
  363. base = None
  364. infos['all_degree_entropy'] = self._compute_all_degree_entropy(base=base)
  365. if 'ave_degree_entropy' in keys:
  366. if params is not None and ('ave_degree_entropy' in params) and ('base' in params['ave_degree_entropy']):
  367. base = params['ave_degree_entropy']['base']
  368. else:
  369. base = None
  370. infos['ave_degree_entropy'] = np.mean(self._compute_all_degree_entropy(base=base))
  371. return infos
  372. def print_graph_infos(self, infos):
  373. from collections import OrderedDict
  374. keys = list(infos.keys())
  375. print(OrderedDict(sorted(infos.items(), key=lambda i: keys.index(i[0]))))
  376. def remove_labels(self, node_labels=[], edge_labels=[], node_attrs=[], edge_attrs=[]):
  377. node_labels = [item for item in node_labels if item in self._node_labels]
  378. edge_labels = [item for item in edge_labels if item in self._edge_labels]
  379. node_attrs = [item for item in node_attrs if item in self._node_attrs]
  380. edge_attrs = [item for item in edge_attrs if item in self._edge_attrs]
  381. for g in self._graphs:
  382. for nd in g.nodes():
  383. for nl in node_labels:
  384. del g.nodes[nd][nl]
  385. for na in node_attrs:
  386. del g.nodes[nd][na]
  387. for ed in g.edges():
  388. for el in edge_labels:
  389. del g.edges[ed][el]
  390. for ea in edge_attrs:
  391. del g.edges[ed][ea]
  392. if len(node_labels) > 0:
  393. self._node_labels = [nl for nl in self._node_labels if nl not in node_labels]
  394. if len(edge_labels) > 0:
  395. self._edge_labels = [el for el in self._edge_labels if el not in edge_labels]
  396. if len(node_attrs) > 0:
  397. self._node_attrs = [na for na in self._node_attrs if na not in node_attrs]
  398. if len(edge_attrs) > 0:
  399. self._edge_attrs = [ea for ea in self._edge_attrs if ea not in edge_attrs]
  400. def clean_labels(self):
  401. labels = []
  402. for name in self._node_labels:
  403. label = set()
  404. for G in self._graphs:
  405. label = label | set(nx.get_node_attributes(G, name).values())
  406. if len(label) > 1:
  407. labels.append(name)
  408. break
  409. if len(label) < 2:
  410. for G in self._graphs:
  411. for nd in G.nodes():
  412. del G.nodes[nd][name]
  413. self._node_labels = labels
  414. labels = []
  415. for name in self._edge_labels:
  416. label = set()
  417. for G in self._graphs:
  418. label = label | set(nx.get_edge_attributes(G, name).values())
  419. if len(label) > 1:
  420. labels.append(name)
  421. break
  422. if len(label) < 2:
  423. for G in self._graphs:
  424. for ed in G.edges():
  425. del G.edges[ed][name]
  426. self._edge_labels = labels
  427. labels = []
  428. for name in self._node_attrs:
  429. label = set()
  430. for G in self._graphs:
  431. label = label | set(nx.get_node_attributes(G, name).values())
  432. if len(label) > 1:
  433. labels.append(name)
  434. break
  435. if len(label) < 2:
  436. for G in self._graphs:
  437. for nd in G.nodes():
  438. del G.nodes[nd][name]
  439. self._node_attrs = labels
  440. labels = []
  441. for name in self._edge_attrs:
  442. label = set()
  443. for G in self._graphs:
  444. label = label | set(nx.get_edge_attributes(G, name).values())
  445. if len(label) > 1:
  446. labels.append(name)
  447. break
  448. if len(label) < 2:
  449. for G in self._graphs:
  450. for ed in G.edges():
  451. del G.edges[ed][name]
  452. self._edge_attrs = labels
  453. def cut_graphs(self, range_):
  454. self._graphs = [self._graphs[i] for i in range_]
  455. if self._targets is not None:
  456. self._targets = [self._targets[i] for i in range_]
  457. self.clean_labels()
  458. def trim_dataset(self, edge_required=False):
  459. if edge_required:
  460. trimed_pairs = [(idx, g) for idx, g in enumerate(self._graphs) if (nx.number_of_nodes(g) != 0 and nx.number_of_edges(g) != 0)]
  461. else:
  462. trimed_pairs = [(idx, g) for idx, g in enumerate(self._graphs) if nx.number_of_nodes(g) != 0]
  463. idx = [p[0] for p in trimed_pairs]
  464. self._graphs = [p[1] for p in trimed_pairs]
  465. self._targets = [self._targets[i] for i in idx]
  466. self.clean_labels()
  467. def copy(self):
  468. dataset = Dataset()
  469. graphs = [g.copy() for g in self._graphs] if self._graphs is not None else None
  470. target = self._targets.copy() if self._targets is not None else None
  471. node_labels = self._node_labels.copy() if self._node_labels is not None else None
  472. node_attrs = self._node_attrs.copy() if self._node_attrs is not None else None
  473. edge_labels = self._edge_labels.copy() if self._edge_labels is not None else None
  474. edge_attrs = self._edge_attrs.copy() if self._edge_attrs is not None else None
  475. dataset.load_graphs(graphs, target)
  476. dataset.set_labels(node_labels=node_labels, node_attrs=node_attrs, edge_labels=edge_labels, edge_attrs=edge_attrs)
  477. # @todo: clean_labels and add other class members?
  478. return dataset
  479. def get_all_node_labels(self):
  480. node_labels = []
  481. for g in self._graphs:
  482. for n in g.nodes():
  483. nl = tuple(g.nodes[n].items())
  484. if nl not in node_labels:
  485. node_labels.append(nl)
  486. return node_labels
  487. def get_all_edge_labels(self):
  488. edge_labels = []
  489. for g in self._graphs:
  490. for e in g.edges():
  491. el = tuple(g.edges[e].items())
  492. if el not in edge_labels:
  493. edge_labels.append(el)
  494. return edge_labels
  495. def _get_dataset_size(self):
  496. return len(self._graphs)
  497. def _get_all_node_nums(self):
  498. return [nx.number_of_nodes(G) for G in self._graphs]
  499. def _get_total_node_nums(self, all_node_nums):
  500. return np.sum(all_node_nums)
  501. def _get_ave_node_num(self, all_node_nums):
  502. return np.mean(all_node_nums)
  503. def _get_min_node_num(self, all_node_nums):
  504. return np.amin(all_node_nums)
  505. def _get_max_node_num(self, all_node_nums):
  506. return np.amax(all_node_nums)
  507. def _get_all_edge_nums(self):
  508. return [nx.number_of_edges(G) for G in self._graphs]
  509. def _get_total_edge_nums(self, all_edge_nums):
  510. return np.sum(all_edge_nums)
  511. def _get_ave_edge_num(self, all_edge_nums):
  512. return np.mean(all_edge_nums)
  513. def _get_min_edge_num(self, all_edge_nums):
  514. return np.amin(all_edge_nums)
  515. def _get_max_edge_num(self, all_edge_nums):
  516. return np.amax(all_edge_nums)
  517. def _get_node_label_dim(self):
  518. return len(self._node_labels)
  519. def _get_node_label_num(self, node_label):
  520. nl = set()
  521. for G in self._graphs:
  522. nl = nl | set(nx.get_node_attributes(G, node_label).values())
  523. return len(nl)
  524. def _get_edge_label_dim(self):
  525. return len(self._edge_labels)
  526. def _get_edge_label_num(self, edge_label):
  527. el = set()
  528. for G in self._graphs:
  529. el = el | set(nx.get_edge_attributes(G, edge_label).values())
  530. return len(el)
  531. def _is_directed(self):
  532. return nx.is_directed(self._graphs[0])
  533. def _get_all_node_degrees(self):
  534. return [np.mean(list(dict(G.degree()).values())) for G in self._graphs]
  535. def _get_ave_node_degree(self, all_node_degrees):
  536. return np.mean(all_node_degrees)
  537. def _get_max_node_degree(self, all_node_degrees):
  538. return np.amax(all_node_degrees)
  539. def _get_min_node_degree(self, all_node_degrees):
  540. return np.amin(all_node_degrees)
  541. def _get_all_fill_factors(self):
  542. """Get fill factor, the number of non-zero entries in the adjacency matrix.
  543. Returns
  544. -------
  545. list[float]
  546. List of fill factors for all graphs.
  547. """
  548. return [nx.number_of_edges(G) / (nx.number_of_nodes(G) ** 2) for G in self._graphs]
  549. def _get_ave_fill_factor(self, all_fill_factors):
  550. return np.mean(all_fill_factors)
  551. def _get_max_fill_factor(self, all_fill_factors):
  552. return np.amax(all_fill_factors)
  553. def _get_min_fill_factor(self, all_fill_factors):
  554. return np.amin(all_fill_factors)
  555. def _get_substructures(self):
  556. subs = set()
  557. for G in self._graphs:
  558. degrees = list(dict(G.degree()).values())
  559. if any(i == 2 for i in degrees):
  560. subs.add('linear')
  561. if np.amax(degrees) >= 3:
  562. subs.add('non linear')
  563. if 'linear' in subs and 'non linear' in subs:
  564. break
  565. if self._directed:
  566. for G in self._graphs:
  567. if len(list(nx.find_cycle(G))) > 0:
  568. subs.add('cyclic')
  569. break
  570. # else:
  571. # # @todo: this method does not work for big graph with large amount of edges like D&D, try a better way.
  572. # upper = np.amin([nx.number_of_edges(G) for G in Gn]) * 2 + 10
  573. # for G in Gn:
  574. # if (nx.number_of_edges(G) < upper):
  575. # cyc = list(nx.simple_cycles(G.to_directed()))
  576. # if any(len(i) > 2 for i in cyc):
  577. # subs.add('cyclic')
  578. # break
  579. # if 'cyclic' not in subs:
  580. # for G in Gn:
  581. # cyc = list(nx.simple_cycles(G.to_directed()))
  582. # if any(len(i) > 2 for i in cyc):
  583. # subs.add('cyclic')
  584. # break
  585. return subs
  586. def _get_class_num(self):
  587. return len(set(self._targets))
  588. def _get_node_attr_dim(self):
  589. return len(self._node_attrs)
  590. def _get_edge_attr_dim(self):
  591. return len(self._edge_attrs)
  592. def _compute_all_degree_entropy(self, base=None):
  593. """Compute the entropy of degree distribution of each graph.
  594. Parameters
  595. ----------
  596. base : float, optional
  597. The logarithmic base to use. The default is ``e`` (natural logarithm).
  598. Returns
  599. -------
  600. degree_entropy : float
  601. The calculated entropy.
  602. """
  603. from gklearn.utils.stats import entropy
  604. degree_entropy = []
  605. for g in self._graphs:
  606. degrees = list(dict(g.degree()).values())
  607. en = entropy(degrees, base=base)
  608. degree_entropy.append(en)
  609. return degree_entropy
  610. @property
  611. def graphs(self):
  612. return self._graphs
  613. @property
  614. def targets(self):
  615. return self._targets
  616. @property
  617. def node_labels(self):
  618. return self._node_labels
  619. @property
  620. def edge_labels(self):
  621. return self._edge_labels
  622. @property
  623. def node_attrs(self):
  624. return self._node_attrs
  625. @property
  626. def edge_attrs(self):
  627. return self._edge_attrs
  628. def split_dataset_by_target(dataset):
  629. import warnings
  630. warnings.simplefilter('always', DeprecationWarning)
  631. warnings.warn('This function has been moved to "gklearn.dataset" module. The function "gklearn.utils.dataset.split_dataset_by_target" has not been maintained since Nov 12th, 2020 (version 0.2.1) and will be removed since version 0.4.0.', DeprecationWarning)
  632. from gklearn.preimage.utils import get_same_item_indices
  633. graphs = dataset.graphs
  634. targets = dataset.targets
  635. datasets = []
  636. idx_targets = get_same_item_indices(targets)
  637. for key, val in idx_targets.items():
  638. sub_graphs = [graphs[i] for i in val]
  639. sub_dataset = Dataset()
  640. sub_dataset.load_graphs(sub_graphs, [key] * len(val))
  641. node_labels = dataset.node_labels.copy() if dataset.node_labels is not None else None
  642. node_attrs = dataset.node_attrs.copy() if dataset.node_attrs is not None else None
  643. edge_labels = dataset.edge_labels.copy() if dataset.edge_labels is not None else None
  644. edge_attrs = dataset.edge_attrs.copy() if dataset.edge_attrs is not None else None
  645. sub_dataset.set_labels(node_labels=node_labels, node_attrs=node_attrs, edge_labels=edge_labels, edge_attrs=edge_attrs)
  646. datasets.append(sub_dataset)
  647. # @todo: clean_labels?
  648. return datasets

A Python package for graph kernels, graph edit distances and graph pre-image problem.