#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Oct 20 14:25:49 2020

@author:
    Paul Zanoncelli, paul.zanoncelli@ecole.ensicaen.fr
    Luc Brun, luc.brun@ensicaen.fr
    Sebastien Bougleux, sebastien.bougleux@unicaen.fr
    Benoit Gaüzère, benoit.gauzere@insa-rouen.fr
    Linlin Jia, linlin.jia@insa-rouen.fr
"""
import os
import os.path as osp
import urllib.request
import urllib.error
import tarfile
from zipfile import ZipFile
# from gklearn.utils.graphfiles import loadDataset  # needed by the legacy load_dataset() methods below.
import torch.nn.functional as F
import networkx as nx
import torch
import random
import sys
from lxml import etree
import re
from tqdm import tqdm
from gklearn.dataset import DATABASES, DATASET_META


class DataFetcher():

    def __init__(self, name=None, root='datasets', reload=False, verbose=False):
        self._name = name
        self._root = root
        if not osp.exists(self._root):
            os.makedirs(self._root)
        self._reload = reload
        self._verbose = verbose
        # self.has_train_valid_test = {
        #     "Coil_Del" : ('COIL-DEL/data/test.cxl','COIL-DEL/data/train.cxl','COIL-DEL/data/valid.cxl'),
        #     "Coil_Rag" : ('COIL-RAG/data/test.cxl','COIL-RAG/data/train.cxl','COIL-RAG/data/valid.cxl'),
        #     "Fingerprint" : ('Fingerprint/data/test.cxl','Fingerprint/data/train.cxl','Fingerprint/data/valid.cxl'),
        #     # "Grec" : ('GREC/data/test.cxl','GREC/data/train.cxl','GREC/data/valid.cxl'),
        #     "Letter" : {'HIGH' : ('Letter/HIGH/test.cxl','Letter/HIGH/train.cxl','Letter/HIGH/validation.cxl'),
        #                 'MED' : ('Letter/MED/test.cxl','Letter/MED/train.cxl','Letter/MED/validation.cxl'),
        #                 'LOW' : ('Letter/LOW/test.cxl','Letter/LOW/train.cxl','Letter/LOW/validation.cxl')
        #                 },
        #     "Mutagenicity" : ('Mutagenicity/data/test.cxl','Mutagenicity/data/train.cxl','Mutagenicity/data/validation.cxl'),
        #     # "Pah" : ['PAH/testset_0.ds','PAH/trainset_0.ds'],
        #     "Protein" : ('Protein/data/test.cxl','Protein/data/train.cxl','Protein/data/valid.cxl'),
        #     # "Web" : ('Web/data/test.cxl','Web/data/train.cxl','Web/data/valid.cxl')
        # }

        if self._name is None:
            if self._verbose:
                print('No dataset name entered. All possible datasets will be loaded.')
            self._name, self._path = [], []
            for idx, ds_name in enumerate(DATASET_META):
                if self._verbose:
                    print(str(idx + 1), '/', str(len(DATASET_META)), 'Fetching', ds_name, end='... ')
                self._name.append(ds_name)
                success = self.write_archive_file(ds_name)
                if success:
                    self._path.append(self.open_files(ds_name))
                else:
                    self._path.append(None)
                if self._verbose and self._path[-1] is not None and not self._reload:
                    print('Fetched.')

            if self._verbose:
                print('Finished.', str(sum(v is not None for v in self._path)), 'of', str(len(self._path)), 'datasets are successfully fetched.')

        elif self._name not in DATASET_META:
            message = 'Invalid Dataset name "' + self._name + '".'
            message += '\nAvailable datasets are as follows: \n\n'
            message += '\n'.join(ds for ds in sorted(DATASET_META))
            message += '\n\nThe following special suffixes can be added to the name:'
            message += '\n\n' + '\n'.join(['_unlabeled'])
            raise ValueError(message)

        else:
            self.write_archive_file(self._name)
            self._path = self.open_files(self._name)

        # self.max_for_letter = 0
        # if mode == 'Pytorch':
        #     if self._name in self.data_to_use_in_datasets:
        #         Gs, y = self.dataset
        #         inputs, adjs, y = self.from_networkx_to_pytorch(Gs, y)
        #         # print(inputs, adjs)
        #         self.pytorch_dataset = inputs, adjs, y
        #     elif self._name == "Pah":
        #         self.pytorch_dataset = []
        #         test, train = self.dataset
        #         Gs_test, y_test = test
        #         Gs_train, y_train = train
        #         self.pytorch_dataset.append(self.from_networkx_to_pytorch(Gs_test, y_test))
        #         self.pytorch_dataset.append(self.from_networkx_to_pytorch(Gs_train, y_train))
        #     elif self._name in self.has_train_valid_test:
        #         self.pytorch_dataset = []
        #         # [G[e[0]][e[1]]['bond_type'] for e in G.edges()] for G in Gs])
        #         test, train, valid = self.dataset
        #         Gs_test, y_test = test
        #         Gs_train, y_train = train
        #         Gs_valid, y_valid = valid
        #         self.pytorch_dataset.append(self.from_networkx_to_pytorch(Gs_test, y_test))
        #         self.pytorch_dataset.append(self.from_networkx_to_pytorch(Gs_train, y_train))
        #         self.pytorch_dataset.append(self.from_networkx_to_pytorch(Gs_valid, y_valid))
        # #############
        # """
        # for G in Gs:
        #     for e in G.edges():
        #         print(G[e[0]])
        # """
        # ##############

    def download_file(self, url):
        try:
            response = urllib.request.urlopen(url)
        except urllib.error.HTTPError:
            print('"', url.split('/')[-1], '" is not available or incorrect http link.')
            return
        except urllib.error.URLError:
            print('Network is unreachable.')
            return
        return response


    def write_archive_file(self, ds_name):
        path = osp.join(self._root, ds_name)
        # filename_dir = osp.join(path, filename)
        if not osp.exists(path) or self._reload:
            url = DATASET_META[ds_name]['url']
            response = self.download_file(url)
            if response is None:
                return False
            os.makedirs(path, exist_ok=True)
            with open(os.path.join(path, url.split('/')[-1]), 'wb') as outfile:
                outfile.write(response.read())
        return True

    def open_files(self, ds_name=None):
        if ds_name is None:
            ds_name = (self._name if isinstance(self._name, str) else self._name[0])
        filename = DATASET_META[ds_name]['url'].split('/')[-1]
        path = osp.join(self._root, ds_name)
        filename_archive = osp.join(path, filename)

        if filename.endswith('gz'):
            if tarfile.is_tarfile(filename_archive):
                with tarfile.open(filename_archive, 'r:gz') as tar:
                    if self._reload and self._verbose:
                        print(filename + ' Downloaded.')
                    subpath = os.path.join(path, tar.getnames()[0].split('/')[0])
                    if not osp.exists(subpath) or self._reload:
                        tar.extractall(path=path)
                    return subpath
        elif filename.endswith('.tar'):
            if tarfile.is_tarfile(filename_archive):
                with tarfile.open(filename_archive, 'r:') as tar:
                    if self._reload and self._verbose:
                        print(filename + ' Downloaded.')
                    subpath = os.path.join(path, tar.getnames()[0])
                    if not osp.exists(subpath) or self._reload:
                        tar.extractall(path=path)
                    return subpath
        elif filename.endswith('.zip'):
            with ZipFile(filename_archive, 'r') as zip_ref:
                if self._reload and self._verbose:
                    print(filename + ' Downloaded.')
                subpath = os.path.join(path, zip_ref.namelist()[0])
                if not osp.exists(subpath) or self._reload:
                    zip_ref.extractall(path)
                return subpath
        else:
            raise ValueError(filename + ' Unsupported file.')
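
    # Example usage (a minimal sketch, assuming 'MUTAG' and 'Letter-high' are
    # keys of DATASET_META and that their archive URLs are reachable):
    #
    #     fetcher = DataFetcher(name='MUTAG', root='datasets', verbose=True)
    #     print(fetcher.path)                          # directory of the extracted dataset
    #     fetcher.write_archive_file('Letter-high')    # download another archive only
    #     print(fetcher.open_files('Letter-high'))     # extract it and get its directory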

    def get_all_ds_infos(self, database):
        """Get information of all datasets from a database.

        Parameters
        ----------
        database : string
            Name of the database; currently only 'tudataset' is implemented
            ('iam' is a placeholder).

        Returns
        -------
        infos : dict
            Information of each dataset in the database, keyed by dataset name.
        """
        if database.lower() == 'tudataset':
            infos = self.get_all_tud_ds_infos()
        elif database.lower() == 'iam':
            infos = {}  # @todo: the IAM database is not supported yet.
        else:
            msg = 'Invalid Database name "' + database + '".'
            msg += '\nAvailable databases are as follows: \n\n'
            msg += '\n'.join(db for db in sorted(DATABASES))
            msg += '\n\nCheck "gklearn.dataset.DATASET_META" for more details.'
            raise ValueError(msg)

        return infos

    def get_all_tud_ds_infos(self):
        """Get information of all datasets from the database TUDataset.

        Returns
        -------
        infos : dict
            Information of each dataset, keyed by dataset name.
        """
        try:
            response = urllib.request.urlopen(DATABASES['tudataset'])
        except urllib.error.HTTPError:
            print('The URL of the database "TUDataset" is not available:\n' + DATABASES['tudataset'])
            return {}

        infos = {}

        # Get tables.
        h_str = response.read()
        tree = etree.HTML(h_str)
        tables = tree.xpath('//table')
        for table in tables:
            # Get the domain of the datasets.
            h2_nodes = table.getprevious()
            if h2_nodes is not None and h2_nodes.tag == 'h2':
                domain = h2_nodes.text.strip().lower()
            else:
                domain = ''

            # Get each line in the table.
            tr_nodes = table.xpath('tbody/tr')
            for tr in tr_nodes[1:]:
                # Get each element in the line.
                td_node = tr.xpath('td')

                # task type.
                cls_txt = td_node[3].text.strip()
                if not cls_txt.startswith('R'):
                    class_number = int(cls_txt)
                    task_type = 'classification'
                else:
                    class_number = None
                    task_type = 'regression'

                # node attrs.
                na_text = td_node[8].text.strip()
                if not na_text.startswith('+'):
                    node_attr_dim = 0
                else:
                    node_attr_dim = int(re.findall(r'\((.*)\)', na_text)[0])

                # edge attrs.
                ea_text = td_node[10].text.strip()
                if ea_text == 'temporal':
                    edge_attr_dim = ea_text
                elif not ea_text.startswith('+'):
                    edge_attr_dim = 0
                else:
                    edge_attr_dim = int(re.findall(r'\((.*)\)', ea_text)[0])

                # geometry.
                geo_txt = td_node[9].text.strip()
                if geo_txt == '–':
                    geometry = None
                else:
                    geometry = geo_txt

                # url.
                url = td_node[11].xpath('a')[0].attrib['href'].strip()
                pos_zip = url.rfind('.zip')
                url = url[:pos_zip + 4]

                infos[td_node[0].xpath('strong')[0].text.strip()] = {
                    'database': 'tudataset',
                    'reference': td_node[1].text.strip(),
                    'dataset_size': int(td_node[2].text.strip()),
                    'class_number': class_number,
                    'task_type': task_type,
                    'ave_node_num': float(td_node[4].text.strip()),
                    'ave_edge_num': float(td_node[5].text.strip()),
                    'node_labeled': td_node[6].text.strip() == '+',
                    'edge_labeled': td_node[7].text.strip() == '+',
                    'node_attr_dim': node_attr_dim,
                    'geometry': geometry,
                    'edge_attr_dim': edge_attr_dim,
                    'url': url,
                    'domain': domain
                }

        return infos

    def pretty_ds_infos(self, infos):
        """Get a string that pretty-prints the information of datasets.

        Parameters
        ----------
        infos : dict
            The datasets' information.

        Returns
        -------
        p_str : string
            The pretty print of the datasets' information.
        """
        p_str = '{\n'
        for key, val in infos.items():
            p_str += '\t\'' + str(key) + '\': {\n'
            for k, v in val.items():
                p_str += '\t\t\'' + str(k) + '\': '
                if isinstance(v, str):
                    p_str += '\'' + str(v) + '\',\n'
                else:
                    p_str += str(v) + ',\n'
            p_str += '\t},\n'
        p_str += '}'

        return p_str
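
    # Example (sketch; scrapes the dataset table from the TUDataset website,
    # so it needs network access; note that constructing DataFetcher also
    # downloads the named dataset):
    #
    #     fetcher = DataFetcher(name='MUTAG')
    #     infos = fetcher.get_all_ds_infos('tudataset')
    #     print(fetcher.pretty_ds_infos(infos))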

    @property
    def path(self):
        return self._path


    # The methods below are legacy code kept from an earlier version of this
    # module; they rely on attributes (``mode``, ``pytorch_dataset``,
    # ``info_dataset``, ``option``, ``has_train_valid_test``,
    # ``data_to_use_in_datasets``) that are no longer set by ``__init__``,
    # and on the ``loadDataset`` import commented out at the top of the file.

    def dataset(self):
        if self.mode == "Tensorflow":
            return  # @todo: something
        if self.mode == "Pytorch":
            return self.pytorch_dataset
        return self.dataset  # @todo: this returns the method itself, not the loaded data.


    def info(self):
        print(self.info_dataset[self._name])

    def iter_load_dataset(self, data):
        results = []
        for datasets in data:
            results.append(loadDataset(osp.join(self._root, self._name, datasets)))
        return results


    def load_dataset(self, list_files):
        if self._name == "Ptc":
            if type(self.option) != str or self.option.upper() not in ['FR', 'FM', 'MM', 'MR']:
                raise ValueError('option for Ptc dataset needs to be one of: \n fr fm mm mr')
            results = []
            results.append(loadDataset(osp.join(self._root, self._name, 'PTC/Test', self.option.upper() + '.ds')))
            results.append(loadDataset(osp.join(self._root, self._name, 'PTC/Train', self.option.upper() + '.ds')))
            return results
        if self._name == "Pah":
            maximum_sets = 0
            for file in list_files:
                if file.endswith('ds'):
                    maximum_sets = max(maximum_sets, int(file.split('_')[1].split('.')[0]))
            self.max_for_letter = maximum_sets
            if not type(self.option) == int or self.option > maximum_sets or self.option < 0:
                raise ValueError('option needs to be an integer between 0 and ' + str(maximum_sets))
            data = self.has_train_valid_test["Pah"]
            data[0] = self.has_train_valid_test["Pah"][0].split('_')[0] + '_' + str(self.option) + '.ds'
            data[1] = self.has_train_valid_test["Pah"][1].split('_')[0] + '_' + str(self.option) + '.ds'
            return self.iter_load_dataset(data)
        if self._name == "Letter":
            if type(self.option) == str and self.option.upper() in self.has_train_valid_test["Letter"]:
                data = self.has_train_valid_test["Letter"][self.option.upper()]
            else:
                message = "The parameter for Letter is incorrect; choose between: "
                message += "\nhigh med low"
                raise ValueError(message)
            return self.iter_load_dataset(data)
        if self._name in self.has_train_valid_test:  # common IAM dataset with train, valid and test
            data = self.has_train_valid_test[self._name]
            return self.iter_load_dataset(data)
        else:  # common dataset without train, valid and test; only a dataset.ds file
            data = self.data_to_use_in_datasets[self._name]
            if len(data) > 1 and data[0] in list_files and data[1] in list_files:  # case for Alkane
                return loadDataset(osp.join(self._root, self._name, data[0]), filename_y=osp.join(self._root, self._name, data[1]))
            if data in list_files:
                return loadDataset(osp.join(self._root, self._name, data))

    def build_dictionary(self, Gs):
        labels = set()
        # next line: from DeepGraphWithNNTorch
        # bond_type_number_maxi = int(max(max([[G[e[0]][e[1]]['bond_type'] for e in G.edges()] for G in Gs])))
        sizes = set()
        for G in Gs:
            for _, node in G.nodes(data=True):  # or: for node in nx.nodes(G)
                # print(_, node)
                labels.add(node["label"][0])  # labels.add(G.nodes[node]["label"][0])  # what do we use for IAM datasets (they don't have bond_type or event label)?
                sizes.add(G.order())
        label_dict = {}
        # print("labels: ", labels, bond_type_number_maxi)
        for i, label in enumerate(labels):
            label_dict[label] = [0.] * len(labels)
            label_dict[label][i] = 1.
        return label_dict
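
    # Example (sketch): for graphs whose node labels are {'C', 'N', 'O'},
    # build_dictionary returns a one-hot encoding such as
    # {'C': [1., 0., 0.], 'N': [0., 1., 0.], 'O': [0., 0., 1.]}
    # (the key order depends on the iteration order of the label set).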

    def from_networkx_to_pytorch(self, Gs, y):
        # example for MAO: atom_to_onehot = {'C': [1., 0., 0.], 'N': [0., 1., 0.], 'O': [0., 0., 1.]}
        # code from https://github.com/bgauzere/pygnn/blob/master/utils.py
        atom_to_onehot = self.build_dictionary(Gs)
        max_size = 30
        adjs = []
        inputs = []
        for i, G in enumerate(Gs):
            I = torch.eye(G.order(), G.order())
            # A = torch.Tensor(nx.adjacency_matrix(G).todense())
            # A = torch.Tensor(nx.to_numpy_matrix(G))
            A = torch.tensor(nx.to_scipy_sparse_matrix(G, dtype=int, weight='bond_type').todense(), dtype=torch.int)  # what do we use for IAM datasets (they don't have bond_type or event label)?
            adj = F.pad(A, pad=(0, max_size - G.order(), 0, max_size - G.order()))  # add I now? if yes: F.pad(A + I, pad=(...))
            adjs.append(adj)
            f_0 = []
            for _, label in G.nodes(data=True):
                # print(_, label)
                cur_label = atom_to_onehot[label['label'][0]].copy()
                f_0.append(cur_label)
            X = F.pad(torch.Tensor(f_0), pad=(0, 0, 0, max_size - G.order()))
            inputs.append(X)
        return inputs, adjs, y
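
    # Example (sketch, assuming Gs is a list of networkx graphs whose nodes
    # carry a 'label' attribute and whose edges carry a 'bond_type' attribute,
    # e.g. a molecular dataset such as MAO, and that each graph has at most
    # 30 nodes):
    #
    #     inputs, adjs, y = fetcher.from_networkx_to_pytorch(Gs, y)
    #     print(inputs[0].shape)  # (30, number of distinct node labels)
    #     print(adjs[0].shape)    # (30, 30)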

    def from_pytorch_to_tensorflow(self, batch_size):
        # Draw matching random batches of inputs and targets by reusing the
        # same seed for both samples.
        seed = random.randrange(sys.maxsize)
        random.seed(seed)
        tf_inputs = random.sample(self.pytorch_dataset[0], batch_size)
        random.seed(seed)
        tf_y = random.sample(self.pytorch_dataset[2], batch_size)
        return tf_inputs, tf_y

    def from_networkx_to_tensor(self, G, dict):
        A = nx.to_numpy_matrix(G)
        lab = [dict[G.nodes[v]['label'][0]] for v in nx.nodes(G)]
        return (torch.tensor(A).view(1, A.shape[0] * A.shape[1]), torch.tensor(lab))

    # dataset = self.open_files()
    # print(build_dictionary(Gs))
    # dic = {'C': 0, 'N': 1, 'O': 2}
    # A, labels = from_networkx_to_tensor(Gs[13], dic)
    # print(nx.to_numpy_matrix(Gs[13]), labels)
    # print(A, labels)

    # @todo: from_networkx_to_tensorflow

# dataloader = DataLoader('Acyclic', root="database", option='high', mode="Pytorch")
# dataloader.info()
# inputs, adjs, y = dataloader.pytorch_dataset
# """
# test, train, valid = dataloader.dataset
# Gs, y = test
# Gs2, y2 = train
# Gs3, y3 = valid
# """
# # Gs, y = dataloader.
# # print(Gs, y)
# """
# Gs, y = dataloader.dataset
# for G in Gs:
#     for e in G.edges():
#         print(G[e[0]])
# """
# # for e in Gs[13].edges():
# #     print(Gs[13][e[0]])
# # print(from_networkx_to_tensor(Gs[7], {'C': [1., 0., 0.], 'N': [0., 1., 0.], 'O': [0., 0., 1.]}))
# # dataset.open_files()
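
# The commented-out examples above use the legacy ``DataLoader`` function kept
# below for reference. A rough current equivalent with ``DataFetcher`` (a
# sketch, assuming 'Acyclic' is a key of DATASET_META) would be:
#
#     fetcher = DataFetcher(name='Acyclic', root='database')
#     print(fetcher.path)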
  413. # import os
  414. # import os.path as osp
  415. # import urllib
  416. # import tarfile
  417. # from zipfile import ZipFile
  418. # from gklearn.utils.graphfiles import loadDataset
  419. # import torch
  420. # import torch.nn.functional as F
  421. # import networkx as nx
  422. # import matplotlib.pyplot as plt
  423. # import numpy as np
  424. #
  425. # def DataLoader(name,root = 'data',mode = "Networkx",downloadAll = False,reload = False,letter = "High",number = 0,gender = "MM"):
  426. # dir_name = "_".join(name.split("-"))
  427. # if not osp.exists(root) :
  428. # os.makedirs(root)
  429. # url = "https://brunl01.users.greyc.fr/CHEMISTRY/"
  430. # urliam = "https://iapr-tc15.greyc.fr/IAM/"
  431. # list_database = {
  432. # "Ace" : (url,"ACEDataset.tar"),
  433. # "Acyclic" : (url,"Acyclic.tar.gz"),
  434. # "Aids" : (urliam,"AIDS.zip"),
  435. # "Alkane" : (url,"alkane_dataset.tar.gz"),
  436. # "Chiral" : (url,"DatasetAcyclicChiral.tar"),
  437. # "Coil_Del" : (urliam,"COIL-DEL.zip"),
  438. # "Coil_Rag" : (urliam,"COIL-RAG.zip"),
  439. # "Fingerprint" : (urliam,"Fingerprint.zip"),
  440. # "Grec" : (urliam,"GREC.zip"),
  441. # "Letter" : (urliam,"Letter.zip"),
  442. # "Mao" : (url,"mao.tgz"),
  443. # "Monoterpenoides" : (url,"monoterpenoides.tar.gz"),
  444. # "Mutagenicity" : (urliam,"Mutagenicity.zip"),
  445. # "Pah" : (url,"PAH.tar.gz"),
  446. # "Protein" : (urliam,"Protein.zip"),
  447. # "Ptc" : (url,"ptc.tgz"),
  448. # "Steroid" : (url,"SteroidDataset.tar"),
  449. # "Vitamin" : (url,"DatasetVitamin.tar"),
  450. # "Web" : (urliam,"Web.zip")
  451. # }
  452. #
  453. # data_to_use_in_datasets = {
  454. # "Acyclic" : ("Acyclic/dataset_bps.ds"),
  455. # "Aids" : ("AIDS_A.txt"),
  456. # "Alkane" : ("Alkane/dataset.ds","Alkane/dataset_boiling_point_names.txt"),
  457. # "Mao" : ("MAO/dataset.ds"),
  458. # "Monoterpenoides" : ("monoterpenoides/dataset_10+.ds"), #('monoterpenoides/dataset.ds'),('monoterpenoides/dataset_9.ds'),('monoterpenoides/trainset_9.ds')
  459. #
  460. # }
  461. # has_train_valid_test = {
  462. # "Coil_Del" : ('COIL-DEL/data/test.cxl','COIL-DEL/data/train.cxl','COIL-DEL/data/valid.cxl'),
  463. # "Coil_Rag" : ('COIL-RAG/data/test.cxl','COIL-RAG/data/train.cxl','COIL-RAG/data/valid.cxl'),
  464. # "Fingerprint" : ('Fingerprint/data/test.cxl','Fingerprint/data/train.cxl','Fingerprint/data/valid.cxl'),
  465. # "Grec" : ('GREC/data/test.cxl','GREC/data/train.cxl','GREC/data/valid.cxl'),
  466. # "Letter" : {'HIGH' : ('Letter/HIGH/test.cxl','Letter/HIGH/train.cxl','Letter/HIGH/validation.cxl'),
  467. # 'MED' : ('Letter/MED/test.cxl','Letter/MED/train.cxl','Letter/MED/validation.cxl'),
  468. # 'LOW' : ('Letter/LOW/test.cxl','Letter/LOW/train.cxl','Letter/LOW/validation.cxl')
  469. # },
  470. # "Mutagenicity" : ('Mutagenicity/data/test.cxl','Mutagenicity/data/train.cxl','Mutagenicity/data/validation.cxl'),
  471. # "Pah" : ['PAH/testset_0.ds','PAH/trainset_0.ds'],
  472. # "Protein" : ('Protein/data/test.cxl','Protein/data/train.cxl','Protein/data/valid.cxl'),
  473. # "Web" : ('Web/data/test.cxl','Web/data/train.cxl','Web/data/valid.cxl')
  474. # }
  475. #
  476. # if not name :
  477. # raise ValueError("No dataset entered")
  478. # if name not in list_database:
  479. # message = "Invalid Dataset name " + name
  480. # message += '\n Available datasets are as follows : \n\n'
  481. # message += '\n'.join(database for database in list_database)
  482. # raise ValueError(message)
  483. #
  484. # def download_file(url,filename):
  485. # try :
  486. # response = urllib.request.urlopen(url + filename)
  487. # except urllib.error.HTTPError:
  488. # print(filename + " not available or incorrect http link")
  489. # return
  490. # return response
  491. #
  492. # def write_archive_file(root,database):
  493. # path = osp.join(root,database)
  494. # url,filename = list_database[database]
  495. # filename_dir = osp.join(path,filename)
  496. # if not osp.exists(filename_dir) or reload:
  497. # response = download_file(url,filename)
  498. # if response is None :
  499. # return
  500. # if not osp.exists(path) :
  501. # os.makedirs(path)
  502. # with open(filename_dir,'wb') as outfile :
  503. # outfile.write(response.read())
  504. #
  505. # if downloadAll :
  506. # print('Waiting...')
  507. # for database in list_database :
  508. # write_archive_file(root,database)
  509. # print('Downloading finished')
  510. # else:
  511. # write_archive_file(root,name)
  512. #
  513. # def iter_load_dataset(data):
  514. # results = []
  515. # for datasets in data :
  516. # results.append(loadDataset(osp.join(root,name,datasets)))
  517. # return results
  518. #
  519. # def load_dataset(list_files):
  520. # if name == "Ptc":
  521. # if gender.upper() not in ['FR','FM','MM','MR']:
  522. # raise ValueError('gender chosen needs to be one of \n fr fm mm mr')
  523. # results = []
  524. # results.append(loadDataset(osp.join(root,name,'PTC/Test',gender.upper() + '.ds')))
  525. # results.append(loadDataset(osp.join(root,name,'PTC/Train',gender.upper() + '.ds')))
  526. # return results
  527. # if name == "Pah":
  528. # maximum_sets = 0
  529. # for file in list_files:
  530. # if file.endswith('ds'):
  531. # maximum_sets = max(maximum_sets,int(file.split('_')[1].split('.')[0]))
  532. # if number > maximum_sets :
  533. # raise ValueError("Please select a dataset with number less than " + str(maximum_sets + 1))
  534. # data = has_train_valid_test["Pah"]
  535. # data[0] = has_train_valid_test["Pah"][0].split('_')[0] + '_' + str(number) + '.ds'
  536. # data[1] = has_train_valid_test["Pah"][1].split('_')[0] + '_' + str(number) + '.ds'
  537. # #print(data)
  538. # return iter_load_dataset(data)
  539. # if name == "Letter":
  540. # if letter.upper() in has_train_valid_test["Letter"]:
  541. # data = has_train_valid_test["Letter"][letter.upper()]
  542. # else:
  543. # message = "The parameter for letter is incorrect choose between : "
  544. # message += "\nhigh med low"
  545. # raise ValueError(message)
  546. # results = []
  547. # for datasets in data:
  548. # results.append(loadDataset(osp.join(root,name,datasets)))
  549. # return results
  550. # if name in has_train_valid_test : #common IAM dataset with train, valid and test
  551. # data = has_train_valid_test[name]
  552. # results = []
  553. # for datasets in data :
  554. # results.append(loadDataset(osp.join(root,name,datasets)))
  555. # return results
  556. # else: #common dataset without train,valid and test, only dataset.ds file
  557. # data = data_to_use_in_datasets[name]
  558. # if len(data) > 1 and data[0] in list_files and data[1] in list_files:
  559. # return loadDataset(osp.join(root,name,data[0]),filename_y = osp.join(root,name,data[1]))
  560. # if data in list_files:
  561. # return loadDataset(osp.join(root,name,data))
  562. # def open_files():
  563. # filename = list_database[name][1]
  564. # path = osp.join(root,name)
  565. # filename_archive = osp.join(root,name,filename)
  566. #
  567. # if filename.endswith('gz'):
  568. # if tarfile.is_tarfile(filename_archive):
  569. # with tarfile.open(filename_archive,"r:gz") as tar:
  570. # if reload:
  571. # print(filename + " Downloaded")
  572. # tar.extractall(path = path)
  573. # return load_dataset(tar.getnames())
  574. # #raise ValueError("dataset not available")
  575. #
  576. #
  577. # elif filename.endswith('.tar'):
  578. # if tarfile.is_tarfile(filename_archive):
  579. # with tarfile.open(filename_archive,"r:") as tar:
  580. # if reload :
  581. # print(filename + " Downloaded")
  582. # tar.extractall(path = path)
  583. # return load_dataset(tar.getnames())
  584. # elif filename.endswith('.zip'):
  585. # with ZipFile(filename_archive,"r") as zip_ref:
  586. # if reload :
  587. # print(filename + " Downloaded")
  588. # zip_ref.extractall(path)
  589. # return load_dataset(zip_ref.namelist())
  590. # else:
  591. # print(filename + " Unsupported file")
  592. # """
  593. # with tarfile.open(osp.join(root,name,list_database[name][1]),"r:gz") as files:
  594. # for file in files.getnames():
  595. # print(file)
  596. # """
  597. #
  598. # def build_dictionary(Gs):
  599. # labels = set()
  600. # bond_type_number_maxi = int(max(max([[G[e[0]][e[1]]['bond_type'] for e in G.edges()] for G in Gs])))
  601. # print(bond_type_number_maxi)
  602. # sizes = set()
  603. # for G in Gs :
  604. # for _,node in G.nodes(data = True): # or for node in nx.nodes(G)
  605. # #print(node)
  606. # labels.add(node["label"][0]) # labels.add(G.nodes[node]["label"][0])
  607. # sizes.add(G.order())
  608. # if len(labels) >= bond_type_number_maxi:
  609. # break
  610. # label_dict = {}
  611. # for i,label in enumerate(labels):
  612. # label_dict[label] = [0.]*bond_type_number_maxi
  613. # label_dict[label][i] = 1.
  614. # return label_dict
  615. #
  616. # def from_networkx_to_pytorch(Gs):
  617. # #exemple : atom_to_onehot = {'C': [1., 0., 0.], 'N': [0., 1., 0.], 'O': [0., 0., 1.]}
  618. # # code from https://github.com/bgauzere/pygnn/blob/master/utils.py
  619. # atom_to_onehot = build_dictionary(Gs)
  620. # max_size = 30
  621. # adjs = []
  622. # inputs = []
  623. # for i, G in enumerate(Gs):
  624. # I = torch.eye(G.order(), G.order())
  625. # A = torch.Tensor(nx.adjacency_matrix(G).todense())
  626. # A = torch.tensor(nx.to_scipy_sparse_matrix(G,dtype = int,weight = 'bond_type').todense(),dtype = torch.int)
  627. # adj = F.pad(A+I, pad=(0, max_size-G.order(), 0, max_size-G.order())) #add I now ?
  628. # adjs.append(adj)
  629. # f_0 = []
  630. # for _, label in G.nodes(data=True):
  631. # #print(_,label)
  632. # cur_label = atom_to_onehot[label['label'][0]].copy()
  633. # f_0.append(cur_label)
  634. # X = F.pad(torch.Tensor(f_0), pad=(0, 0, 0, max_size-G.order()))
  635. # inputs.append(X)
  636. # return inputs,adjs,y
  637. #
  638. # def from_networkx_to_tensor(G,dict):
  639. # A=nx.to_numpy_matrix(G)
  640. # lab=[dict[G.nodes[v]['label'][0]] for v in nx.nodes(G)]
  641. # return (torch.tensor(A).view(1,A.shape[0]*A.shape[1]),torch.tensor(lab))
  642. #
  643. # dataset= open_files()
  644. # #print(build_dictionary(Gs))
  645. # #dic={'C':0,'N':1,'O':2}
  646. # #A,labels=from_networkx_to_tensor(Gs[13],dic)
  647. # #print(nx.to_numpy_matrix(Gs[13]),labels)
  648. # #print(A,labels)
  649. #
  650. # """
  651. # for G in Gs :
  652. # for node in nx.nodes(G):
  653. # print(G.nodes[node])
  654. # """
  655. # if mode == "pytorch":
  656. # Gs,y = dataset
  657. # inputs,adjs,y = from_networkx_to_pytorch(Gs)
  658. # print(inputs,adjs)
  659. # return inputs,adjs,y
  660. #
  661. #
  662. # """
  663. # dic = dict()
  664. # for i,l in enumerate(label):
  665. # dic[l] = i
  666. # dic = {'C': 0, 'N': 1, 'O': 2}
  667. # A,labels=from_networkx_to_tensor(Gs[0],dic)
  668. # #print(A,labels)
  669. # return A,labels
  670. # """
  671. #
  672. # return dataset
  673. #
  674. # #open_files()
  675. #
  676. # def label_to_color(label):
  677. # if label == 'C':
  678. # return 0.1
  679. # elif label == 'O':
  680. # return 0.8
  681. #
  682. # def nodes_to_color_sequence(G):
  683. # return [label_to_color(c[1]['label'][0]) for c in G.nodes(data=True)]
  684. # ##############
  685. # """
  686. # dataset = DataLoader('Mao',root = "database")
  687. # print(dataset)
  688. # Gs,y = dataset
  689. # """
  690. # """
  691. # dataset = DataLoader('Alkane',root = "database") # Gs is empty here whereas y isn't -> not working
  692. # Gs,y = dataset
  693. # """
  694. # """
  695. # dataset = DataLoader('Acyclic', root = "database")
  696. # Gs,y = dataset
  697. # """
  698. # """
  699. # dataset = DataLoader('Monoterpenoides', root = "database")
  700. # Gs,y = dataset
  701. # """
  702. # """
  703. # dataset = DataLoader('Pah',root = 'database', number = 8)
  704. # test_set,train_set = dataset
  705. # Gs,y = test_set
  706. # Gs2,y2 = train_set
  707. # """
  708. # """
  709. # dataset = DataLoader('Coil_Del',root = "database")
  710. # test,train,valid = dataset
  711. # Gs,y = test
  712. # Gs2,y2 = train
  713. # Gs3, y3 = valid
  714. # """
  715. # """
  716. # dataset = DataLoader('Coil_Rag',root = "database")
  717. # test,train,valid = dataset
  718. # Gs,y = test
  719. # Gs2,y2 = train
  720. # Gs3, y3 = valid
  721. # """
  722. # """
  723. # dataset = DataLoader('Fingerprint',root = "database")
  724. # test,train,valid = dataset
  725. # Gs,y = test
  726. # Gs2,y2 = train
  727. # Gs3, y3 = valid
  728. # """
  729. # """
  730. # dataset = DataLoader('Grec',root = "database")
  731. # test,train,valid = dataset
  732. # Gs,y = test
  733. # Gs2,y2 = train
  734. # Gs3, y3 = valid
  735. # """
  736. # """
  737. # dataset = DataLoader('Letter',root = "database",letter = 'low') #high low med
  738. # test,train,valid = dataset
  739. # Gs,y = test
  740. # Gs2,y2 = train
  741. # Gs3, y3 = valid
  742. # """
  743. # """
  744. # dataset = DataLoader('Mutagenicity',root = "database")
  745. # test,train,valid = dataset
  746. # Gs,y = test
  747. # Gs2,y2 = train
  748. # Gs3, y3 = valid
  749. # """
  750. # """
  751. # dataset = DataLoader('Protein',root = "database")
  752. # test,train,valid = dataset
  753. # Gs,y = test
  754. # Gs2,y2 = train
  755. # Gs3, y3 = valid
  756. # """
  757. # """
  758. # dataset = DataLoader('Ptc', root = "database",gender = 'fm') # not working, Gs and y are empty perhaps issue coming from loadDataset
  759. # valid,train = dataset
  760. # Gs,y = valid
  761. # Gs2,y2 = train
  762. # """
  763. # """
  764. # dataset = DataLoader('Web', root = "database")
  765. # test,train,valid = dataset
  766. # Gs,y = test
  767. # Gs2,y2 = train
  768. # Gs3,y3 = valid
  769. # """
  770. # print(Gs,y)
  771. # print(len(dataset))
  772. # ##############
  773. # #print('edge max label',max(max([[G[e[0]][e[1]]['bond_type'] for e in G.edges()] for G in Gs])))
  774. # G1 = Gs[13]
  775. # G2 = Gs[23]
  776. # """
  777. # nx.draw_networkx(G1,with_labels=True,node_color = nodes_to_color_sequence(G1),cmap='autumn')
  778. # plt.figure()
  779. # nx.draw_networkx(G2,with_labels=True,node_color = nodes_to_color_sequence(G2),cmap='autumn')
  780. # """
  781. # from pathlib import Path
  782. # DATA_PATH = Path("data")
  783. # def import_datasets():
  784. #
  785. # import urllib
  786. # import tarfile
  787. # from zipfile import ZipFile
  788. # URL = "https://brunl01.users.greyc.fr/CHEMISTRY/"
  789. # URLIAM = "https://iapr-tc15.greyc.fr/IAM/"
  790. #
  791. # LIST_DATABASE = {
  792. # "Pah" : (URL,"PAH.tar.gz"),
  793. # "Mao" : (URL,"mao.tgz"),
  794. # "Ptc" : (URL,"ptc.tgz"),
  795. # "Aids" : (URLIAM,"AIDS.zip"),
  796. # "Acyclic" : (URL,"Acyclic.tar.gz"),
  797. # "Alkane" : (URL,"alkane_dataset.tar.gz"),
  798. # "Chiral" : (URL,"DatasetAcyclicChiral.tar"),
  799. # "Vitamin" : (URL,"DatasetVitamin.tar"),
  800. # "Ace" : (URL,"ACEDataset.tar"),
  801. # "Steroid" : (URL,"SteroidDataset.tar"),
  802. # "Monoterpenoides" : (URL,"monoterpenoides.tar.gz"),
  803. # "Letter" : (URLIAM,"Letter.zip"),
  804. # "Grec" : (URLIAM,"GREC.zip"),
  805. # "Fingerprint" : (URLIAM,"Fingerprint.zip"),
  806. # "Coil_Rag" : (URLIAM,"COIL-RAG.zip"),
  807. # "Coil_Del" : (URLIAM,"COIL-DEL.zip"),
  808. # "Web" : (URLIAM,"Web.zip"),
  809. # "Mutagenicity" : (URLIAM,"Mutagenicity.zip"),
  810. # "Protein" : (URLIAM,"Protein.zip")
  811. # }
  812. # print("Select databases in the list. Select multiple, split by white spaces .\nWrite All to select all of them.\n")
  813. # print(', '.join(database for database in LIST_DATABASE))
  814. # print("Choice : ",end = ' ')
  815. # selected_databases = input().split()
  816. #
  817. # def download_file(url,filename):
  818. # try :
  819. # response = urllib.request.urlopen(url + filename)
  820. # except urllib.error.HTTPError:
  821. # print(filename + " not available or incorrect http link")
  822. # return
  823. # return response
  824. #
  825. # def write_archive_file(database):
  826. #
  827. # PATH = DATA_PATH / database
  828. # url,filename = LIST_DATABASE[database]
  829. # if not (PATH / filename).exists():
  830. # response = download_file(url,filename)
  831. # if response is None :
  832. # return
  833. # if not PATH.exists() :
  834. # PATH.mkdir(parents=True, exist_ok=True)
  835. # with open(PATH/filename,'wb') as outfile :
  836. # outfile.write(response.read())
  837. #
  838. # if filename[-2:] == 'gz':
  839. # if tarfile.is_tarfile(PATH/filename):
  840. # with tarfile.open(PATH/filename,"r:gz") as tar:
  841. # tar.extractall(path = PATH)
  842. # print(filename + ' Downloaded')
  843. # elif filename[-3:] == 'tar':
  844. # if tarfile.is_tarfile(PATH/filename):
  845. # with tarfile.open(PATH/filename,"r:") as tar:
  846. # tar.extractall(path = PATH)
  847. # print(filename + ' Downloaded')
  848. # elif filename[-3:] == 'zip':
  849. # with ZipFile(PATH/filename,"r") as zip_ref:
  850. # zip_ref.extractall(PATH)
  851. # print(filename + ' Downloaded')
  852. # else:
  853. # print("Unsupported file")
  854. # if 'All' in selected_databases:
  855. # print('Waiting...')
  856. # for database in LIST_DATABASE :
  857. # write_archive_file(database)
  858. # print('Finished')
  859. # else:
  860. # print('Waiting...')
  861. # for database in selected_databases :
  862. # if database in LIST_DATABASE :
  863. # write_archive_file(database)
  864. # print('Finished')
  865. # import_datasets()
  866. # class GraphFetcher(object):
  867. #
  868. #
  869. # def __init__(self, filename=None, filename_targets=None, **kwargs):
  870. # if filename is None:
  871. # self._graphs = None
  872. # self._targets = None
  873. # self._node_labels = None
  874. # self._edge_labels = None
  875. # self._node_attrs = None
  876. # self._edge_attrs = None
  877. # else:
  878. # self.load_dataset(filename, filename_targets=filename_targets, **kwargs)
  879. #
  880. # self._substructures = None
  881. # self._node_label_dim = None
  882. # self._edge_label_dim = None
  883. # self._directed = None
  884. # self._dataset_size = None
  885. # self._total_node_num = None
  886. # self._ave_node_num = None
  887. # self._min_node_num = None
  888. # self._max_node_num = None
  889. # self._total_edge_num = None
  890. # self._ave_edge_num = None
  891. # self._min_edge_num = None
  892. # self._max_edge_num = None
  893. # self._ave_node_degree = None
  894. # self._min_node_degree = None
  895. # self._max_node_degree = None
  896. # self._ave_fill_factor = None
  897. # self._min_fill_factor = None
  898. # self._max_fill_factor = None
  899. # self._node_label_nums = None
  900. # self._edge_label_nums = None
  901. # self._node_attr_dim = None
  902. # self._edge_attr_dim = None
  903. # self._class_number = None
  904. #
  905. #
  906. # def load_dataset(self, filename, filename_targets=None, **kwargs):
  907. # self._graphs, self._targets, label_names = load_dataset(filename, filename_targets=filename_targets, **kwargs)
  908. # self._node_labels = label_names['node_labels']
  909. # self._node_attrs = label_names['node_attrs']
  910. # self._edge_labels = label_names['edge_labels']
  911. # self._edge_attrs = label_names['edge_attrs']
  912. # self.clean_labels()
  913. #
  914. #
  915. # def load_graphs(self, graphs, targets=None):
  916. # # this has to be followed by set_labels().
  917. # self._graphs = graphs
  918. # self._targets = targets
  919. # # self.set_labels_attrs() # @todo
  920. #
  921. #
  922. # def load_predefined_dataset(self, ds_name):
  923. # current_path = os.path.dirname(os.path.realpath(__file__)) + '/'
  924. # if ds_name == 'Acyclic':
  925. # ds_file = current_path + '../../datasets/Acyclic/dataset_bps.ds'
  926. # self._graphs, self._targets, label_names = load_dataset(ds_file)
  927. # elif ds_name == 'AIDS':
  928. # ds_file = current_path + '../../datasets/AIDS/AIDS_A.txt'
  929. # self._graphs, self._targets, label_names = load_dataset(ds_file)
  930. # elif ds_name == 'Alkane':
  931. # ds_file = current_path + '../../datasets/Alkane/dataset.ds'
  932. # fn_targets = current_path + '../../datasets/Alkane/dataset_boiling_point_names.txt'
  933. # self._graphs, self._targets, label_names = load_dataset(ds_file, filename_targets=fn_targets)
  934. # elif ds_name == 'COIL-DEL':
  935. # ds_file = current_path + '../../datasets/COIL-DEL/COIL-DEL_A.txt'
  936. # self._graphs, self._targets, label_names = load_dataset(ds_file)
  937. # elif ds_name == 'COIL-RAG':
  938. # ds_file = current_path + '../../datasets/COIL-RAG/COIL-RAG_A.txt'
  939. # self._graphs, self._targets, label_names = load_dataset(ds_file)
  940. # elif ds_name == 'COLORS-3':
  941. # ds_file = current_path + '../../datasets/COLORS-3/COLORS-3_A.txt'
  942. # self._graphs, self._targets, label_names = load_dataset(ds_file)
  943. # elif ds_name == 'Cuneiform':
  944. # ds_file = current_path + '../../datasets/Cuneiform/Cuneiform_A.txt'
  945. # self._graphs, self._targets, label_names = load_dataset(ds_file)
  946. # elif ds_name == 'DD':
  947. # ds_file = current_path + '../../datasets/DD/DD_A.txt'
  948. # self._graphs, self._targets, label_names = load_dataset(ds_file)
  949. # elif ds_name == 'ENZYMES':
  950. # ds_file = current_path + '../../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'
  951. # self._graphs, self._targets, label_names = load_dataset(ds_file)
  952. # elif ds_name == 'Fingerprint':
  953. # ds_file = current_path + '../../datasets/Fingerprint/Fingerprint_A.txt'
  954. # self._graphs, self._targets, label_names = load_dataset(ds_file)
  955. # elif ds_name == 'FRANKENSTEIN':
  956. # ds_file = current_path + '../../datasets/FRANKENSTEIN/FRANKENSTEIN_A.txt'
  957. # self._graphs, self._targets, label_names = load_dataset(ds_file)
  958. # elif ds_name == 'Letter-high': # node non-symb
  959. # ds_file = current_path + '../../datasets/Letter-high/Letter-high_A.txt'
  960. # self._graphs, self._targets, label_names = load_dataset(ds_file)
  961. # elif ds_name == 'Letter-low': # node non-symb
  962. # ds_file = current_path + '../../datasets/Letter-low/Letter-low_A.txt'
  963. # self._graphs, self._targets, label_names = load_dataset(ds_file)
  964. # elif ds_name == 'Letter-med': # node non-symb
  965. # ds_file = current_path + '../../datasets/Letter-med/Letter-med_A.txt'
  966. # self._graphs, self._targets, label_names = load_dataset(ds_file)
  967. # elif ds_name == 'MAO':
  968. # ds_file = current_path + '../../datasets/MAO/dataset.ds'
  969. # self._graphs, self._targets, label_names = load_dataset(ds_file)
  970. # elif ds_name == 'Monoterpenoides':
  971. # ds_file = current_path + '../../datasets/Monoterpenoides/dataset_10+.ds'
  972. # self._graphs, self._targets, label_names = load_dataset(ds_file)
  973. # elif ds_name == 'MUTAG':
  974. # ds_file = current_path + '../../datasets/MUTAG/MUTAG_A.txt'
  975. # self._graphs, self._targets, label_names = load_dataset(ds_file)
  976. # elif ds_name == 'NCI1':
  977. # ds_file = current_path + '../../datasets/NCI1/NCI1_A.txt'
  978. # self._graphs, self._targets, label_names = load_dataset(ds_file)
  979. # elif ds_name == 'NCI109':
  980. # ds_file = current_path + '../../datasets/NCI109/NCI109_A.txt'
  981. # self._graphs, self._targets, label_names = load_dataset(ds_file)
  982. # elif ds_name == 'PAH':
  983. # ds_file = current_path + '../../datasets/PAH/dataset.ds'
  984. # self._graphs, self._targets, label_names = load_dataset(ds_file)
  985. # elif ds_name == 'SYNTHETIC':
  986. # pass
  987. # elif ds_name == 'SYNTHETICnew':
  988. # ds_file = current_path + '../../datasets/SYNTHETICnew/SYNTHETICnew_A.txt'
  989. # self._graphs, self._targets, label_names = load_dataset(ds_file)
  990. # elif ds_name == 'Synthie':
  991. # pass
  992. # else:
  993. # raise Exception('The dataset name "', ds_name, '" is not pre-defined.')
  994. #
  995. # self._node_labels = label_names['node_labels']
  996. # self._node_attrs = label_names['node_attrs']
  997. # self._edge_labels = label_names['edge_labels']
  998. # self._edge_attrs = label_names['edge_attrs']
  999. # self.clean_labels()
  1000. #
  1001. # def set_labels(self, node_labels=[], node_attrs=[], edge_labels=[], edge_attrs=[]):
  1002. # self._node_labels = node_labels
  1003. # self._node_attrs = node_attrs
  1004. # self._edge_labels = edge_labels
  1005. # self._edge_attrs = edge_attrs
  1006. #
  1007. # def set_labels_attrs(self, node_labels=None, node_attrs=None, edge_labels=None, edge_attrs=None):
  1008. # # @todo: remove labels which have only one possible values.
  1009. # if node_labels is None:
  1010. # self._node_labels = self._graphs[0].graph['node_labels']
  1011. # # # graphs are considered node unlabeled if all nodes have the same label.
  1012. # # infos.update({'node_labeled': is_nl if node_label_num > 1 else False})
  1013. # if node_attrs is None:
  1014. # self._node_attrs = self._graphs[0].graph['node_attrs']
  1015. # # for G in Gn:
  1016. # # for n in G.nodes(data=True):
  1017. # # if 'attributes' in n[1]:
  1018. # # return len(n[1]['attributes'])
  1019. # # return 0
  1020. # if edge_labels is None:
  1021. # self._edge_labels = self._graphs[0].graph['edge_labels']
  1022. # # # graphs are considered edge unlabeled if all edges have the same label.
  1023. # # infos.update({'edge_labeled': is_el if edge_label_num > 1 else False})
  1024. # if edge_attrs is None:
  1025. # self._edge_attrs = self._graphs[0].graph['edge_attrs']
  1026. # # for G in Gn:
  1027. # # if nx.number_of_edges(G) > 0:
  1028. # # for e in G.edges(data=True):
  1029. # # if 'attributes' in e[2]:
  1030. # # return len(e[2]['attributes'])
  1031. # # return 0
  1032. #
  1033. #
  1034. # def get_dataset_infos(self, keys=None, params=None):
  1035. # """Computes and returns the structure and property information of the graph dataset.
  1036. #
  1037. # Parameters
  1038. # ----------
  1039. # keys : list, optional
  1040. # A list of strings which indicate which informations will be returned. The
  1041. # possible choices includes:
  1042. #
  1043. # 'substructures': sub-structures graphs contains, including 'linear', 'non
  1044. # linear' and 'cyclic'.
  1045. #
  1046. # 'node_label_dim': whether vertices have symbolic labels.
  1047. #
  1048. # 'edge_label_dim': whether egdes have symbolic labels.
  1049. #
  1050. # 'directed': whether graphs in dataset are directed.
  1051. #
  1052. # 'dataset_size': number of graphs in dataset.
  1053. #
  1054. # 'total_node_num': total number of vertices of all graphs in dataset.
  1055. #
  1056. # 'ave_node_num': average number of vertices of graphs in dataset.
  1057. #
  1058. # 'min_node_num': minimum number of vertices of graphs in dataset.
  1059. #
  1060. # 'max_node_num': maximum number of vertices of graphs in dataset.
  1061. #
  1062. # 'total_edge_num': total number of edges of all graphs in dataset.
  1063. #
  1064. # 'ave_edge_num': average number of edges of graphs in dataset.
  1065. #
  1066. # 'min_edge_num': minimum number of edges of graphs in dataset.
  1067. #
  1068. # 'max_edge_num': maximum number of edges of graphs in dataset.
  1069. #
  1070. # 'ave_node_degree': average vertex degree of graphs in dataset.
  1071. #
  1072. # 'min_node_degree': minimum vertex degree of graphs in dataset.
  1073. #
  1074. # 'max_node_degree': maximum vertex degree of graphs in dataset.
  1075. #
  1076. # 'ave_fill_factor': average fill factor (number_of_edges /
  1077. # (number_of_nodes ** 2)) of graphs in dataset.
  1078. #
  1079. # 'min_fill_factor': minimum fill factor of graphs in dataset.
  1080. #
  1081. # 'max_fill_factor': maximum fill factor of graphs in dataset.
  1082. #
  1083. # 'node_label_nums': list of numbers of symbolic vertex labels of graphs in dataset.
  1084. #
  1085. # 'edge_label_nums': list number of symbolic edge labels of graphs in dataset.
  1086. #
  1087. # 'node_attr_dim': number of dimensions of non-symbolic vertex labels.
  1088. # Extracted from the 'attributes' attribute of graph nodes.
  1089. #
  1090. # 'edge_attr_dim': number of dimensions of non-symbolic edge labels.
  1091. # Extracted from the 'attributes' attribute of graph edges.
  1092. #
  1093. # 'class_number': number of classes. Only available for classification problems.
  1094. #
  1095. # 'all_degree_entropy': the entropy of degree distribution of each graph.
  1096. #
  1097. # 'ave_degree_entropy': the average entropy of degree distribution of all graphs.
  1098. #
  1099. # All informations above will be returned if `keys` is not given.
  1100. #
  1101. # params: dict of dict, optional
  1102. # A dictinary which contains extra parameters for each possible
  1103. # element in ``keys``.
  1104. #
  1105. # Return
  1106. # ------
  1107. # dict
  1108. # Information of the graph dataset keyed by `keys`.
  1109. # """
  1110. # infos = {}
  1111. #
  1112. # if keys == None:
  1113. # keys = [
  1114. # 'substructures',
  1115. # 'node_label_dim',
  1116. # 'edge_label_dim',
  1117. # 'directed',
  1118. # 'dataset_size',
  1119. # 'total_node_num',
  1120. # 'ave_node_num',
  1121. # 'min_node_num',
  1122. # 'max_node_num',
  1123. # 'total_edge_num',
  1124. # 'ave_edge_num',
  1125. # 'min_edge_num',
#         'max_edge_num',
#         'ave_node_degree',
#         'min_node_degree',
#         'max_node_degree',
#         'ave_fill_factor',
#         'min_fill_factor',
#         'max_fill_factor',
#         'node_label_nums',
#         'edge_label_nums',
#         'node_attr_dim',
#         'edge_attr_dim',
#         'class_number',
#         'all_degree_entropy',
#         'ave_degree_entropy'
#     ]
#
#     # dataset size
#     if 'dataset_size' in keys:
#         if self._dataset_size is None:
#             self._dataset_size = self._get_dataset_size()
#         infos['dataset_size'] = self._dataset_size
#
#     # graph node number
#     if any(i in keys for i in ['total_node_num', 'ave_node_num', 'min_node_num', 'max_node_num']):
#         all_node_nums = self._get_all_node_nums()
#     if 'total_node_num' in keys:
#         if self._total_node_num is None:
#             self._total_node_num = self._get_total_node_num(all_node_nums)
#         infos['total_node_num'] = self._total_node_num
#
#     if 'ave_node_num' in keys:
#         if self._ave_node_num is None:
#             self._ave_node_num = self._get_ave_node_num(all_node_nums)
#         infos['ave_node_num'] = self._ave_node_num
#
#     if 'min_node_num' in keys:
#         if self._min_node_num is None:
#             self._min_node_num = self._get_min_node_num(all_node_nums)
#         infos['min_node_num'] = self._min_node_num
#
#     if 'max_node_num' in keys:
#         if self._max_node_num is None:
#             self._max_node_num = self._get_max_node_num(all_node_nums)
#         infos['max_node_num'] = self._max_node_num
#
#     # graph edge number
#     if any(i in keys for i in ['total_edge_num', 'ave_edge_num', 'min_edge_num', 'max_edge_num']):
#         all_edge_nums = self._get_all_edge_nums()
#     if 'total_edge_num' in keys:
#         if self._total_edge_num is None:
#             self._total_edge_num = self._get_total_edge_num(all_edge_nums)
#         infos['total_edge_num'] = self._total_edge_num
#
#     if 'ave_edge_num' in keys:
#         if self._ave_edge_num is None:
#             self._ave_edge_num = self._get_ave_edge_num(all_edge_nums)
#         infos['ave_edge_num'] = self._ave_edge_num
#
#     if 'max_edge_num' in keys:
#         if self._max_edge_num is None:
#             self._max_edge_num = self._get_max_edge_num(all_edge_nums)
#         infos['max_edge_num'] = self._max_edge_num
#     if 'min_edge_num' in keys:
#         if self._min_edge_num is None:
#             self._min_edge_num = self._get_min_edge_num(all_edge_nums)
#         infos['min_edge_num'] = self._min_edge_num
#
#     # label number
#     if 'node_label_dim' in keys:
#         if self._node_label_dim is None:
#             self._node_label_dim = self._get_node_label_dim()
#         infos['node_label_dim'] = self._node_label_dim
#
#     if 'node_label_nums' in keys:
#         if self._node_label_nums is None:
#             self._node_label_nums = {}
#             for node_label in self._node_labels:
#                 self._node_label_nums[node_label] = self._get_node_label_num(node_label)
#         infos['node_label_nums'] = self._node_label_nums
#
#     if 'edge_label_dim' in keys:
#         if self._edge_label_dim is None:
#             self._edge_label_dim = self._get_edge_label_dim()
#         infos['edge_label_dim'] = self._edge_label_dim
#
#     if 'edge_label_nums' in keys:
#         if self._edge_label_nums is None:
#             self._edge_label_nums = {}
#             for edge_label in self._edge_labels:
#                 self._edge_label_nums[edge_label] = self._get_edge_label_num(edge_label)
#         infos['edge_label_nums'] = self._edge_label_nums
#
#     if 'directed' in keys or 'substructures' in keys:
#         if self._directed is None:
#             self._directed = self._is_directed()
#         infos['directed'] = self._directed
#
#     # node degree
#     if any(i in keys for i in ['ave_node_degree', 'max_node_degree', 'min_node_degree']):
#         all_node_degrees = self._get_all_node_degrees()
#
#     if 'ave_node_degree' in keys:
#         if self._ave_node_degree is None:
#             self._ave_node_degree = self._get_ave_node_degree(all_node_degrees)
#         infos['ave_node_degree'] = self._ave_node_degree
#
#     if 'max_node_degree' in keys:
#         if self._max_node_degree is None:
#             self._max_node_degree = self._get_max_node_degree(all_node_degrees)
#         infos['max_node_degree'] = self._max_node_degree
#
#     if 'min_node_degree' in keys:
#         if self._min_node_degree is None:
#             self._min_node_degree = self._get_min_node_degree(all_node_degrees)
#         infos['min_node_degree'] = self._min_node_degree
#
#     # fill factor
#     if any(i in keys for i in ['ave_fill_factor', 'max_fill_factor', 'min_fill_factor']):
#         all_fill_factors = self._get_all_fill_factors()
#
#     if 'ave_fill_factor' in keys:
#         if self._ave_fill_factor is None:
#             self._ave_fill_factor = self._get_ave_fill_factor(all_fill_factors)
#         infos['ave_fill_factor'] = self._ave_fill_factor
#
#     if 'max_fill_factor' in keys:
#         if self._max_fill_factor is None:
#             self._max_fill_factor = self._get_max_fill_factor(all_fill_factors)
#         infos['max_fill_factor'] = self._max_fill_factor
#
#     if 'min_fill_factor' in keys:
#         if self._min_fill_factor is None:
#             self._min_fill_factor = self._get_min_fill_factor(all_fill_factors)
#         infos['min_fill_factor'] = self._min_fill_factor
#
#     if 'substructures' in keys:
#         if self._substructures is None:
#             self._substructures = self._get_substructures()
#         infos['substructures'] = self._substructures
#
#     if 'class_number' in keys:
#         if self._class_number is None:
#             self._class_number = self._get_class_number()
#         infos['class_number'] = self._class_number
#
#     if 'node_attr_dim' in keys:
#         if self._node_attr_dim is None:
#             self._node_attr_dim = self._get_node_attr_dim()
#         infos['node_attr_dim'] = self._node_attr_dim
#
#     if 'edge_attr_dim' in keys:
#         if self._edge_attr_dim is None:
#             self._edge_attr_dim = self._get_edge_attr_dim()
#         infos['edge_attr_dim'] = self._edge_attr_dim
#
#     # entropy of degree distribution.
#
#     if 'all_degree_entropy' in keys:
#         if params is not None and ('all_degree_entropy' in params) and ('base' in params['all_degree_entropy']):
#             base = params['all_degree_entropy']['base']
#         else:
#             base = None
#         infos['all_degree_entropy'] = self._compute_all_degree_entropy(base=base)
#
#     if 'ave_degree_entropy' in keys:
#         if params is not None and ('ave_degree_entropy' in params) and ('base' in params['ave_degree_entropy']):
#             base = params['ave_degree_entropy']['base']
#         else:
#             base = None
#         infos['ave_degree_entropy'] = np.mean(self._compute_all_degree_entropy(base=base))
#
#     return infos
#
#
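# The commented-out block above computes each requested statistic lazily and
# caches it on the instance before collecting everything into `infos`.  As a
# minimal, self-contained sketch of what a few of those statistics reduce to
# (hypothetical helper name, not part of the module API):

def _demo_basic_infos(graphs):
    """Illustrative sketch: dataset size, average node/edge counts and average degree."""
    import networkx as nx
    import numpy as np
    node_nums = [nx.number_of_nodes(g) for g in graphs]
    edge_nums = [nx.number_of_edges(g) for g in graphs]
    ave_degrees = [np.mean(list(dict(g.degree()).values())) for g in graphs]
    return {
        'dataset_size': len(graphs),
        'ave_node_num': np.mean(node_nums),
        'ave_edge_num': np.mean(edge_nums),
        'ave_node_degree': np.mean(ave_degrees),
    }

# For example, _demo_basic_infos([nx.path_graph(3), nx.cycle_graph(4)]) gives a
# dataset_size of 2, an ave_node_num of 3.5 and an ave_edge_num of 3.0.
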
# def print_graph_infos(self, infos):
#     from collections import OrderedDict
#     keys = list(infos.keys())
#     print(OrderedDict(sorted(infos.items(), key=lambda i: keys.index(i[0]))))
#
#
# def remove_labels(self, node_labels=[], edge_labels=[], node_attrs=[], edge_attrs=[]):
#     node_labels = [item for item in node_labels if item in self._node_labels]
#     edge_labels = [item for item in edge_labels if item in self._edge_labels]
#     node_attrs = [item for item in node_attrs if item in self._node_attrs]
#     edge_attrs = [item for item in edge_attrs if item in self._edge_attrs]
#     for g in self._graphs:
#         for nd in g.nodes():
#             for nl in node_labels:
#                 del g.nodes[nd][nl]
#             for na in node_attrs:
#                 del g.nodes[nd][na]
#         for ed in g.edges():
#             for el in edge_labels:
#                 del g.edges[ed][el]
#             for ea in edge_attrs:
#                 del g.edges[ed][ea]
#     if len(node_labels) > 0:
#         self._node_labels = [nl for nl in self._node_labels if nl not in node_labels]
#     if len(edge_labels) > 0:
#         self._edge_labels = [el for el in self._edge_labels if el not in edge_labels]
#     if len(node_attrs) > 0:
#         self._node_attrs = [na for na in self._node_attrs if na not in node_attrs]
#     if len(edge_attrs) > 0:
#         self._edge_attrs = [ea for ea in self._edge_attrs if ea not in edge_attrs]
#
#
# def clean_labels(self):
#     # Drop node/edge labels and attributes that take fewer than two distinct
#     # values across the whole dataset, both from the graphs and from the
#     # bookkeeping lists.
#     labels = []
#     for name in self._node_labels:
#         label = set()
#         for G in self._graphs:
#             label = label | set(nx.get_node_attributes(G, name).values())
#             if len(label) > 1:
#                 labels.append(name)
#                 break
#         if len(label) < 2:
#             for G in self._graphs:
#                 for nd in G.nodes():
#                     del G.nodes[nd][name]
#     self._node_labels = labels
#     labels = []
#     for name in self._edge_labels:
#         label = set()
#         for G in self._graphs:
#             label = label | set(nx.get_edge_attributes(G, name).values())
#             if len(label) > 1:
#                 labels.append(name)
#                 break
#         if len(label) < 2:
#             for G in self._graphs:
#                 for ed in G.edges():
#                     del G.edges[ed][name]
#     self._edge_labels = labels
#     labels = []
#     for name in self._node_attrs:
#         label = set()
#         for G in self._graphs:
#             label = label | set(nx.get_node_attributes(G, name).values())
#             if len(label) > 1:
#                 labels.append(name)
#                 break
#         if len(label) < 2:
#             for G in self._graphs:
#                 for nd in G.nodes():
#                     del G.nodes[nd][name]
#     self._node_attrs = labels
#     labels = []
#     for name in self._edge_attrs:
#         label = set()
#         for G in self._graphs:
#             label = label | set(nx.get_edge_attributes(G, name).values())
#             if len(label) > 1:
#                 labels.append(name)
#                 break
#         if len(label) < 2:
#             for G in self._graphs:
#                 for ed in G.edges():
#                     del G.edges[ed][name]
#     self._edge_attrs = labels
#
#
# def cut_graphs(self, range_):
#     self._graphs = [self._graphs[i] for i in range_]
#     if self._targets is not None:
#         self._targets = [self._targets[i] for i in range_]
#     self.clean_labels()
#
#
# def trim_dataset(self, edge_required=False):
#     if edge_required:
#         trimmed_pairs = [(idx, g) for idx, g in enumerate(self._graphs) if (nx.number_of_nodes(g) != 0 and nx.number_of_edges(g) != 0)]
#     else:
#         trimmed_pairs = [(idx, g) for idx, g in enumerate(self._graphs) if nx.number_of_nodes(g) != 0]
#     idx = [p[0] for p in trimmed_pairs]
#     self._graphs = [p[1] for p in trimmed_pairs]
#     self._targets = [self._targets[i] for i in idx]
#     self.clean_labels()
#
#
# def copy(self):
#     dataset = Dataset()
#     graphs = [g.copy() for g in self._graphs] if self._graphs is not None else None
#     target = self._targets.copy() if self._targets is not None else None
#     node_labels = self._node_labels.copy() if self._node_labels is not None else None
#     node_attrs = self._node_attrs.copy() if self._node_attrs is not None else None
#     edge_labels = self._edge_labels.copy() if self._edge_labels is not None else None
#     edge_attrs = self._edge_attrs.copy() if self._edge_attrs is not None else None
#     dataset.load_graphs(graphs, target)
#     dataset.set_labels(node_labels=node_labels, node_attrs=node_attrs, edge_labels=edge_labels, edge_attrs=edge_attrs)
#     # @todo: clean_labels and add other class members?
#     return dataset
#
#
# def get_all_node_labels(self):
#     node_labels = []
#     for g in self._graphs:
#         for n in g.nodes():
#             nl = tuple(g.nodes[n].items())
#             if nl not in node_labels:
#                 node_labels.append(nl)
#     return node_labels
#
#
# def get_all_edge_labels(self):
#     edge_labels = []
#     for g in self._graphs:
#         for e in g.edges():
#             el = tuple(g.edges[e].items())
#             if el not in edge_labels:
#                 edge_labels.append(el)
#     return edge_labels
#
#
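# In the two collectors above, each node's / edge's full attribute dict is
# frozen into a tuple of (key, value) pairs so that distinct labellings can be
# compared and de-duplicated.  A tiny, self-contained illustration
# (hypothetical demo, not part of the module API):

def _demo_collect_node_labellings():
    """Illustrative sketch: collect the distinct node labellings of one graph."""
    import networkx as nx
    g = nx.Graph()
    g.add_node(0, atom='C', charge=0)
    g.add_node(1, atom='O', charge=0)
    g.add_node(2, atom='C', charge=0)  # same labelling as node 0
    labellings = []
    for n in g.nodes():
        nl = tuple(g.nodes[n].items())  # e.g. (('atom', 'C'), ('charge', 0))
        if nl not in labellings:
            labellings.append(nl)
    return labellings  # two distinct labellings for three nodes
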
# def _get_dataset_size(self):
#     return len(self._graphs)
#
#
# def _get_all_node_nums(self):
#     return [nx.number_of_nodes(G) for G in self._graphs]
#
#
# def _get_total_node_num(self, all_node_nums):
#     return np.sum(all_node_nums)
#
#
# def _get_ave_node_num(self, all_node_nums):
#     return np.mean(all_node_nums)
#
#
# def _get_min_node_num(self, all_node_nums):
#     return np.amin(all_node_nums)
#
#
# def _get_max_node_num(self, all_node_nums):
#     return np.amax(all_node_nums)
#
#
# def _get_all_edge_nums(self):
#     return [nx.number_of_edges(G) for G in self._graphs]
#
#
# def _get_total_edge_num(self, all_edge_nums):
#     return np.sum(all_edge_nums)
#
#
# def _get_ave_edge_num(self, all_edge_nums):
#     return np.mean(all_edge_nums)
#
#
# def _get_min_edge_num(self, all_edge_nums):
#     return np.amin(all_edge_nums)
#
#
# def _get_max_edge_num(self, all_edge_nums):
#     return np.amax(all_edge_nums)
#
#
# def _get_node_label_dim(self):
#     return len(self._node_labels)
#
#
# def _get_node_label_num(self, node_label):
#     nl = set()
#     for G in self._graphs:
#         nl = nl | set(nx.get_node_attributes(G, node_label).values())
#     return len(nl)
#
#
# def _get_edge_label_dim(self):
#     return len(self._edge_labels)
#
#
# def _get_edge_label_num(self, edge_label):
#     el = set()
#     for G in self._graphs:
#         el = el | set(nx.get_edge_attributes(G, edge_label).values())
#     return len(el)
#
#
# def _is_directed(self):
#     return nx.is_directed(self._graphs[0])
#
#
# def _get_all_node_degrees(self):
#     # Note: this returns the average node degree of each graph, not the degree of every node.
#     return [np.mean(list(dict(G.degree()).values())) for G in self._graphs]
#
#
# def _get_ave_node_degree(self, all_node_degrees):
#     return np.mean(all_node_degrees)
#
#
# def _get_max_node_degree(self, all_node_degrees):
#     return np.amax(all_node_degrees)
#
#
# def _get_min_node_degree(self, all_node_degrees):
#     return np.amin(all_node_degrees)
#
#
# def _get_all_fill_factors(self):
#     """Get the fill factor of each graph, i.e. the proportion of non-zero
#     entries in its adjacency matrix (edge count divided by the squared node count).
#
#     Returns
#     -------
#     list[float]
#         List of fill factors for all graphs.
#     """
#     return [nx.number_of_edges(G) / (nx.number_of_nodes(G) ** 2) for G in self._graphs]
#
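# Worked example for the fill factor above: an undirected path graph with 4
# nodes has 3 edges, so its fill factor is 3 / 4**2 = 0.1875.  A minimal,
# self-contained check (hypothetical demo, not part of the module API):

def _demo_fill_factor():
    """Illustrative sketch: fill factor of a 4-node path graph."""
    import networkx as nx
    g = nx.path_graph(4)  # 4 nodes, 3 edges
    return nx.number_of_edges(g) / (nx.number_of_nodes(g) ** 2)  # 0.1875
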
# def _get_ave_fill_factor(self, all_fill_factors):
#     return np.mean(all_fill_factors)
#
#
# def _get_max_fill_factor(self, all_fill_factors):
#     return np.amax(all_fill_factors)
#
#
# def _get_min_fill_factor(self, all_fill_factors):
#     return np.amin(all_fill_factors)
#
#
# def _get_substructures(self):
#     subs = set()
#     for G in self._graphs:
#         degrees = list(dict(G.degree()).values())
#         if any(i == 2 for i in degrees):
#             subs.add('linear')
#         if np.amax(degrees) >= 3:
#             subs.add('non linear')
#         if 'linear' in subs and 'non linear' in subs:
#             break
#     if self._directed:
#         for G in self._graphs:
#             try:
#                 if len(list(nx.find_cycle(G))) > 0:
#                     subs.add('cyclic')
#                     break
#             except nx.NetworkXNoCycle:
#                 # nx.find_cycle raises NetworkXNoCycle when G is acyclic.
#                 pass
#     # else:
#     #     # @todo: this method does not work for big graphs with a large number of edges, such as D&D; try a better way.
#     #     upper = np.amin([nx.number_of_edges(G) for G in Gn]) * 2 + 10
#     #     for G in Gn:
#     #         if (nx.number_of_edges(G) < upper):
#     #             cyc = list(nx.simple_cycles(G.to_directed()))
#     #             if any(len(i) > 2 for i in cyc):
#     #                 subs.add('cyclic')
#     #                 break
#     #     if 'cyclic' not in subs:
#     #         for G in Gn:
#     #             cyc = list(nx.simple_cycles(G.to_directed()))
#     #             if any(len(i) > 2 for i in cyc):
#     #                 subs.add('cyclic')
#     #                 break
#
#     return subs
#
#
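# The substructure tags above are coarse: 'linear' if any node has degree 2,
# 'non linear' if any node has degree >= 3, and 'cyclic' if a cycle is found in
# a directed graph.  A minimal, self-contained illustration (hypothetical demo,
# not part of the module API):

def _demo_substructures():
    """Illustrative sketch: substructure tags of a directed triangle with a tail."""
    import networkx as nx
    import numpy as np
    g = nx.DiGraph([(0, 1), (1, 2), (2, 0), (2, 3)])  # a 3-cycle plus one extra edge
    degrees = list(dict(g.degree()).values())
    subs = set()
    if any(d == 2 for d in degrees):
        subs.add('linear')
    if np.amax(degrees) >= 3:
        subs.add('non linear')
    try:
        if len(list(nx.find_cycle(g))) > 0:
            subs.add('cyclic')
    except nx.NetworkXNoCycle:
        pass
    return subs  # {'linear', 'non linear', 'cyclic'}
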
# def _get_class_number(self):
#     return len(set(self._targets))
#
#
# def _get_node_attr_dim(self):
#     return len(self._node_attrs)
#
#
# def _get_edge_attr_dim(self):
#     return len(self._edge_attrs)
#
#
# def _compute_all_degree_entropy(self, base=None):
#     """Compute the entropy of the degree distribution of each graph.
#
#     Parameters
#     ----------
#     base : float, optional
#         The logarithmic base to use. The default is ``e`` (natural logarithm).
#
#     Returns
#     -------
#     degree_entropy : list[float]
#         The entropy of the degree distribution of each graph.
#     """
#     from gklearn.utils.stats import entropy
#
#     degree_entropy = []
#     for g in self._graphs:
#         degrees = list(dict(g.degree()).values())
#         en = entropy(degrees, base=base)
#         degree_entropy.append(en)
#     return degree_entropy
#
#
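# The entropy helper imported above lives elsewhere in gklearn and is not shown
# in this file.  Assuming it computes the Shannon entropy of the empirical
# distribution of the supplied degree values (an assumption made here purely
# for illustration), an equivalent self-contained sketch looks like:

def _demo_degree_entropy(graph, base=None):
    """Illustrative sketch: Shannon entropy of a graph's degree distribution."""
    import numpy as np
    degrees = np.array(list(dict(graph.degree()).values()), dtype=float)
    # Empirical distribution over the observed degree values.
    _, counts = np.unique(degrees, return_counts=True)
    probs = counts / counts.sum()
    log = np.log if base is None else (lambda x: np.log(x) / np.log(base))
    return float(-np.sum(probs * log(probs)))
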
# @property
# def graphs(self):
#     return self._graphs
#
#
# @property
# def targets(self):
#     return self._targets
#
#
# @property
# def node_labels(self):
#     return self._node_labels
#
#
# @property
# def edge_labels(self):
#     return self._edge_labels
#
#
# @property
# def node_attrs(self):
#     return self._node_attrs
#
#
# @property
# def edge_attrs(self):
#     return self._edge_attrs
#
#
# def split_dataset_by_target(dataset):
#     from gklearn.preimage.utils import get_same_item_indices
#
#     graphs = dataset.graphs
#     targets = dataset.targets
#     datasets = []
#     idx_targets = get_same_item_indices(targets)
#     for key, val in idx_targets.items():
#         sub_graphs = [graphs[i] for i in val]
#         sub_dataset = Dataset()
#         sub_dataset.load_graphs(sub_graphs, [key] * len(val))
#         node_labels = dataset.node_labels.copy() if dataset.node_labels is not None else None
#         node_attrs = dataset.node_attrs.copy() if dataset.node_attrs is not None else None
#         edge_labels = dataset.edge_labels.copy() if dataset.edge_labels is not None else None
#         edge_attrs = dataset.edge_attrs.copy() if dataset.edge_attrs is not None else None
#         sub_dataset.set_labels(node_labels=node_labels, node_attrs=node_attrs, edge_labels=edge_labels, edge_attrs=edge_attrs)
#         datasets.append(sub_dataset)
#         # @todo: clean_labels?
#     return datasets
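# The function above groups the dataset's graphs by their target value and
# builds one sub-dataset per class.  The grouping step itself reduces to a
# dictionary from target value to indices; a minimal, self-contained sketch of
# that core idea (hypothetical helper, not the gklearn implementation):

def _demo_group_indices_by_target(targets):
    """Illustrative sketch: map each distinct target value to the indices that carry it."""
    groups = {}
    for idx, t in enumerate(targets):
        groups.setdefault(t, []).append(idx)
    return groups

# For example, _demo_group_indices_by_target([1, 0, 1, 1, 0]) returns
# {1: [0, 2, 3], 0: [1, 4]}.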
