#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Oct 20 14:25:49 2020

@author:
    Paul Zanoncelli, paul.zanoncelli@ecole.ensicaen.fr
    Luc Brun, luc.brun@ensicaen.fr
    Sebastien Bougleux, sebastien.bougleux@unicaen.fr
    Benoit Gaüzère, benoit.gauzere@insa-rouen.fr
    Linlin Jia, linlin.jia@insa-rouen.fr
"""
import os
import os.path as osp
import random
import re
import sys
import tarfile
import urllib.error
import urllib.request
from zipfile import ZipFile

import networkx as nx
import numpy as np
import torch
import torch.nn.functional as F
from lxml import etree

from gklearn.dataset import DATABASES
from gklearn.utils.graph_files import load_dataset
from gklearn.utils.graphfiles import loadDataset
class DataFetcher():

    def __init__(self, name='Ace', root='data', downloadAll=False, reload=False, mode='Networkx', option=None):  # option : number, gender, letter
        self.name = name
        self.dir_name = "_".join(name.split("-"))
        self.root = root
        self.option = option
        self.mode = mode
        if not osp.exists(self.root):
            os.makedirs(self.root)
        self.url = "https://brunl01.users.greyc.fr/CHEMISTRY/"
        self.urliam = "https://iapr-tc15.greyc.fr/IAM/"
        self.downloadAll = downloadAll
        self.reload = reload
        self.list_database = {
            # "Ace" : (self.url,"ACEDataset.tar"),
            # "Acyclic" : (self.url,"Acyclic.tar.gz"),
            # "Aids" : (self.urliam,"AIDS.zip"),
            # "Alkane" : (self.url,"alkane_dataset.tar.gz"),
            # "Chiral" : (self.url,"DatasetAcyclicChiral.tar"),
            # "Coil_Del" : (self.urliam,"COIL-DEL.zip"),
            # "Coil_Rag" : (self.urliam,"COIL-RAG.zip"),
            # "Fingerprint" : (self.urliam,"Fingerprint.zip"),
            # "Grec" : (self.urliam,"GREC.zip"),
            # "Letter" : (self.urliam,"Letter.zip"),
            # "Mao" : (self.url,"mao.tgz"),
            # "Monoterpenoides" : (self.url,"monoterpenoides.tar.gz"),
            # "Mutagenicity" : (self.urliam,"Mutagenicity.zip"),
            # "Pah" : (self.url,"PAH.tar.gz"),
            # "Protein" : (self.urliam,"Protein.zip"),
            # "Ptc" : (self.url,"ptc.tgz"),
            # "Steroid" : (self.url,"SteroidDataset.tar"),
            # "Vitamin" : (self.url,"DatasetVitamin.tar"),
            # "Web" : (self.urliam,"Web.zip")
        }
        self.data_to_use_in_datasets = {
            # "Acyclic" : ("Acyclic/dataset_bps.ds"),
            # "Aids" : ("AIDS_A.txt"),
            # "Alkane" : ("Alkane/dataset.ds","Alkane/dataset_boiling_point_names.txt"),
            # "Mao" : ("MAO/dataset.ds"),
            # "Monoterpenoides" : ("monoterpenoides/dataset_10+.ds"), #('monoterpenoides/dataset.ds'),('monoterpenoides/dataset_9.ds'),('monoterpenoides/trainset_9.ds')
        }
        self.has_train_valid_test = {
            "Coil_Del" : ('COIL-DEL/data/test.cxl','COIL-DEL/data/train.cxl','COIL-DEL/data/valid.cxl'),
            "Coil_Rag" : ('COIL-RAG/data/test.cxl','COIL-RAG/data/train.cxl','COIL-RAG/data/valid.cxl'),
            "Fingerprint" : ('Fingerprint/data/test.cxl','Fingerprint/data/train.cxl','Fingerprint/data/valid.cxl'),
            # "Grec" : ('GREC/data/test.cxl','GREC/data/train.cxl','GREC/data/valid.cxl'),
            "Letter" : {'HIGH' : ('Letter/HIGH/test.cxl','Letter/HIGH/train.cxl','Letter/HIGH/validation.cxl'),
                        'MED' : ('Letter/MED/test.cxl','Letter/MED/train.cxl','Letter/MED/validation.cxl'),
                        'LOW' : ('Letter/LOW/test.cxl','Letter/LOW/train.cxl','Letter/LOW/validation.cxl')
                        },
            "Mutagenicity" : ('Mutagenicity/data/test.cxl','Mutagenicity/data/train.cxl','Mutagenicity/data/validation.cxl'),
            # "Pah" : ['PAH/testset_0.ds','PAH/trainset_0.ds'],
            "Protein" : ('Protein/data/test.cxl','Protein/data/train.cxl','Protein/data/valid.cxl'),
            # "Web" : ('Web/data/test.cxl','Web/data/train.cxl','Web/data/valid.cxl')
        }
        # if not self.name :
        #     raise ValueError("No dataset entered")
        # if self.name not in self.list_database:
        #     message = "Invalid Dataset name " + self.name
        #     message += '\n Available datasets are as follows : \n\n'
        #
        #     message += '\n'.join(database for database in self.list_database)
        #     raise ValueError(message)
        # if self.downloadAll :
        #     print('Waiting...')
        #     for database in self.list_database :
        #         self.write_archive_file(database)
        #     print('Finished')
        # else:
        #     self.write_archive_file(self.name)
        # self.max_for_letter = 0
        # self.dataset = self.open_files()
        self.info_dataset = {
            # 'Ace' : "This dataset is not available yet",
            # 'Acyclic' : "This dataset isn't composed of valid, test, train dataset but one whole dataset \ndataloader = DataLoader('Acyclic', root = ...) \nGs,y = dataloader.dataset ",
            # 'Aids' : "This dataset is not available yet",
            # 'Alkane' : "This dataset isn't composed of valid, test, train dataset but one whole dataset \ndataloader = DataLoader('Alkane', root = ...) \nGs,y = dataloader.dataset ",
            # 'Chiral' : "This dataset is not available yet",
            # "Coil-Del" : "This dataset has test,train,valid datasets. \ndataloader = DataLoader('Coil-Del', root = ...). \ntest,train,valid = dataloader.dataset \nGs_test,y_test = test \nGs_train,y_train = train \nGs_valid,y_valid = valid",
            # "Coil-Rag" : "This dataset has test,train,valid datasets. \ndataloader = DataLoader('Coil-Rag', root = ...). \ntest,train,valid = dataloader.dataset \nGs_test,y_test = test \nGs_train,y_train = train\n Gs_valid,y_valid = valid",
            # "Fingerprint" : "This dataset has test,train,valid datasets. \ndataloader = DataLoader('Fingerprint', root = ...). \ntest,train,valid = dataloader.dataset. \nGs_test,y_test = test \nGs_train,y_train = train\n Gs_valid,y_valid = valid",
            # "Grec" : "This dataset has test,train,valid datasets. Write dataloader = DataLoader('Grec', root = ...). \ntest,train,valid = dataloader.dataset. \nGs_test,y_test = test\n Gs_train,y_train = train\n Gs_valid,y_valid = valid",
            # "Letter" : "This dataset has test,train,valid datasets. Choose between high,low,med dataset. \ndataloader = DataLoader('Letter', root = ..., option = 'high') \ntest,train,valid = dataloader.dataset \nGs_test,y_test = test \nGs_train,y_train = train \nGs_valid,y_valid = valid",
            # 'Mao' : "This dataset isn't composed of valid, test, train dataset but one whole dataset \ndataloader = DataLoader('Mao', root = ...) \nGs,y = dataloader.dataset ",
            # 'Monoterpenoides': "This dataset isn't composed of valid, test, train dataset but one whole dataset\n Write dataloader = DataLoader('Monoterpenoides', root = ...) \nGs,y = dataloader.dataset ",
            # 'Mutagenicity' : "This dataset has test,train,valid datasets. \ndataloader = DataLoader('Mutagenicity', root = ...) \ntest,train,valid = dataloader.dataset \nGs_test,y_test = test\n Gs_train,y_train = train \nGs_valid,y_valid = valid",
            # 'Pah' : 'This dataset is composed of test and train datasets. ' + str(self.max_for_letter + 1) + ' datasets are available. \nChoose number between 0 and ' + str(self.max_for_letter) + "\ndataloader = DataLoader('Pah', root = ..., option = 0) \ntest,train = dataloader.dataset \nGs_test,y_test = test \nGs_train,y_train = train\n ",
            # "Protein" : "This dataset has test,train,valid datasets. \ndataloader = DataLoader('Protein', root = ...) \ntest,train,valid = dataloader.dataset \nGs_test,y_test = test \nGs_train,y_train = train \nGs_valid,y_valid = valid",
            # "Ptc" : "This dataset has test and train datasets. Select gender between mm, fm, mr, fr. \ndataloader = DataLoader('Ptc', root = ..., option = 'mm') \ntest,train = dataloader.dataset \nGs_test,y_test = test \nGs_train,y_train = train",
            # "Steroid" : "This dataset is not available yet",
            # 'Vitamin' : "This dataset is not available yet",
            # 'Web' : "This dataset has test,train,valid datasets. \ndataloader = DataLoader('Web', root = ...) \ntest,train,valid = dataloader.dataset \nGs_test,y_test = test \nGs_train,y_train = train \nGs_valid,y_valid = valid",
        }
        if mode == "Pytorch":
            # Note: this branch assumes `self.dataset` has been populated with the
            # loaded graphs (e.g. via `self.dataset = self.open_files()`, currently
            # commented out above).
            if self.name in self.data_to_use_in_datasets :
                Gs,y = self.dataset
                inputs,adjs,y = self.from_networkx_to_pytorch(Gs,y)
                #print(inputs,adjs)
                self.pytorch_dataset = inputs,adjs,y
            elif self.name == "Pah":
                self.pytorch_dataset = []
                test,train = self.dataset
                Gs_test,y_test = test
                Gs_train,y_train = train
                self.pytorch_dataset.append(self.from_networkx_to_pytorch(Gs_test,y_test))
                self.pytorch_dataset.append(self.from_networkx_to_pytorch(Gs_train,y_train))
            elif self.name in self.has_train_valid_test:
                self.pytorch_dataset = []
                #[G[e[0]][e[1]]['bond_type'] for e in G.edges()] for G in Gs])
                test,train,valid = self.dataset
                Gs_test,y_test = test
                Gs_train,y_train = train
                Gs_valid,y_valid = valid
                self.pytorch_dataset.append(self.from_networkx_to_pytorch(Gs_test,y_test))
                self.pytorch_dataset.append(self.from_networkx_to_pytorch(Gs_train,y_train))
                self.pytorch_dataset.append(self.from_networkx_to_pytorch(Gs_valid,y_valid))
        #############
        """
        for G in Gs :
            for e in G.edges():
                print(G[e[0]])
        """
        ##############
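
    # Usage sketch (a non-authoritative example; it assumes the dataset archive has
    # already been extracted under `root` and that `self.dataset` has been populated
    # by `self.open_files()`, which is currently commented out in __init__):
    #
    #     fetcher = DataFetcher(name='Letter', root='data', mode='Pytorch', option='HIGH')
    #     test, train, valid = fetcher.pytorch_dataset   # each item is (inputs, adjs, y)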
    def download_file(self, url, filename):
        try:
            response = urllib.request.urlopen(url + filename)
        except urllib.error.HTTPError:
            print(filename + " not available or incorrect http link")
            return
        return response

    def write_archive_file(self, database):
        path = osp.join(self.root,database)
        url,filename = self.list_database[database]
        filename_dir = osp.join(path,filename)
        if not osp.exists(filename_dir) or self.reload:
            response = self.download_file(url,filename)
            if response is None :
                return
            if not osp.exists(path) :
                os.makedirs(path)
            with open(filename_dir,'wb') as outfile :
                outfile.write(response.read())
    def dataset(self):
        # In "Pytorch" mode this returns the tensors built in __init__; otherwise it
        # relies on `self.dataset` having been overwritten by the data loaded with
        # `self.open_files()` (see the commented-out line in __init__).
        if self.mode == "Tensorflow":
            return  # @todo: Tensorflow mode is not implemented yet.
        if self.mode == "Pytorch":
            return self.pytorch_dataset
        return self.dataset

    def info(self):
        print(self.info_dataset[self.name])
    def iter_load_dataset(self, data):
        results = []
        for datasets in data :
            results.append(loadDataset(osp.join(self.root,self.name,datasets)))
        return results
    def load_dataset(self, list_files):
        if self.name == "Ptc":
            if type(self.option) != str or self.option.upper() not in ['FR','FM','MM','MR']:
                raise ValueError('option for Ptc dataset needs to be one of : \n fr fm mm mr')
            results = []
            results.append(loadDataset(osp.join(self.root,self.name,'PTC/Test',self.option.upper() + '.ds')))
            results.append(loadDataset(osp.join(self.root,self.name,'PTC/Train',self.option.upper() + '.ds')))
            return results
        if self.name == "Pah":
            maximum_sets = 0
            for file in list_files:
                if file.endswith('ds'):
                    maximum_sets = max(maximum_sets,int(file.split('_')[1].split('.')[0]))
            self.max_for_letter = maximum_sets
            if not type(self.option) == int or self.option > maximum_sets or self.option < 0:
                raise ValueError('option needs to be an integer between 0 and ' + str(maximum_sets))
            data = self.has_train_valid_test["Pah"]
            data[0] = self.has_train_valid_test["Pah"][0].split('_')[0] + '_' + str(self.option) + '.ds'
            data[1] = self.has_train_valid_test["Pah"][1].split('_')[0] + '_' + str(self.option) + '.ds'
            return self.iter_load_dataset(data)
        if self.name == "Letter":
            if type(self.option) == str and self.option.upper() in self.has_train_valid_test["Letter"]:
                data = self.has_train_valid_test["Letter"][self.option.upper()]
            else:
                message = "The parameter for letter is incorrect choose between : "
                message += "\nhigh med low"
                raise ValueError(message)
            return self.iter_load_dataset(data)
        if self.name in self.has_train_valid_test :  # common IAM dataset with train, valid and test
            data = self.has_train_valid_test[self.name]
            return self.iter_load_dataset(data)
        else:  # common dataset without train, valid and test, only a dataset.ds file
            data = self.data_to_use_in_datasets[self.name]
            if len(data) > 1 and data[0] in list_files and data[1] in list_files:  # case for Alkane
                return loadDataset(osp.join(self.root,self.name,data[0]),filename_y = osp.join(self.root,self.name,data[1]))
            if data in list_files:
                return loadDataset(osp.join(self.root,self.name,data))
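
    # Shape of what load_dataset() returns, per branch above (for reference):
    #   "Ptc"                      -> [ (Gs_test, y_test), (Gs_train, y_train) ]
    #   "Pah"                      -> [ (Gs_test, y_test), (Gs_train, y_train) ]
    #   "Letter" / IAM-style sets  -> [ (Gs_test, y_test), (Gs_train, y_train), (Gs_valid, y_valid) ]
    #   single-file datasets       -> (Gs, y) as returned by loadDataset()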
    def open_files(self):
        filename = self.list_database[self.name][1]
        path = osp.join(self.root,self.name)
        filename_archive = osp.join(path,filename)
        if filename.endswith('gz'):
            if tarfile.is_tarfile(filename_archive):
                with tarfile.open(filename_archive,"r:gz") as tar:
                    if self.reload:
                        print(filename + " Downloaded")
                    tar.extractall(path = path)
                    return self.load_dataset(tar.getnames())
        elif filename.endswith('.tar'):
            if tarfile.is_tarfile(filename_archive):
                with tarfile.open(filename_archive,"r:") as tar:
                    if self.reload :
                        print(filename + " Downloaded")
                    tar.extractall(path = path)
                    return self.load_dataset(tar.getnames())
        elif filename.endswith('.zip'):
            with ZipFile(filename_archive,"r") as zip_ref:
                if self.reload :
                    print(filename + " Downloaded")
                zip_ref.extractall(path)
                return self.load_dataset(zip_ref.namelist())
        else:
            print(filename + " Unsupported file")
    def build_dictionary(self, Gs):
        labels = set()
        # next line : from DeepGraphWithNNTorch
        #bond_type_number_maxi = int(max(max([[G[e[0]][e[1]]['bond_type'] for e in G.edges()] for G in Gs])))
        sizes = set()
        for G in Gs :
            for _,node in G.nodes(data = True):  # or: for node in nx.nodes(G)
                #print(_,node)
                labels.add(node["label"][0])  # labels.add(G.nodes[node]["label"][0]) #what do we use for IAM datasets (they don't have bond_type or event label) ?
            sizes.add(G.order())
        label_dict = {}
        #print("labels : ", labels, bond_type_number_maxi)
        for i,label in enumerate(labels):
            label_dict[label] = [0.]*len(labels)
            label_dict[label][i] = 1.
        return label_dict
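
    # Example of the one-hot mapping built above (a sketch for a chemistry dataset
    # whose node labels are 'C', 'N' and 'O'; the actual ordering depends on set
    # iteration order):
    #   {'C': [1., 0., 0.], 'N': [0., 1., 0.], 'O': [0., 0., 1.]}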
    def from_networkx_to_pytorch(self, Gs, y):
        # example for MAO: atom_to_onehot = {'C': [1., 0., 0.], 'N': [0., 1., 0.], 'O': [0., 0., 1.]}
        # code from https://github.com/bgauzere/pygnn/blob/master/utils.py
        atom_to_onehot = self.build_dictionary(Gs)
        max_size = 30
        adjs = []
        inputs = []
        for i, G in enumerate(Gs):
            I = torch.eye(G.order(), G.order())
            #A = torch.Tensor(nx.adjacency_matrix(G).todense())
            #A = torch.Tensor(nx.to_numpy_matrix(G))
            A = torch.tensor(nx.to_scipy_sparse_matrix(G,dtype = int,weight = 'bond_type').todense(),dtype = torch.int)  # what do we use for IAM datasets (they don't have bond_type or event label) ?
            adj = F.pad(A, pad=(0, max_size-G.order(), 0, max_size-G.order()))  # add I now ? if yes : F.pad(A + I, pad = (...))
            adjs.append(adj)
            f_0 = []
            for _, label in G.nodes(data=True):
                #print(_,label)
                cur_label = atom_to_onehot[label['label'][0]].copy()
                f_0.append(cur_label)
            X = F.pad(torch.Tensor(f_0), pad=(0, 0, 0, max_size-G.order()))
            inputs.append(X)
        return inputs,adjs,y
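
    # Sketch of the tensors produced above (with max_size = 30 as set in the method):
    #   adjs[i]   : 30 x 30 int tensor, the zero-padded adjacency of graph i, with
    #               entries taken from the 'bond_type' edge attribute;
    #   inputs[i] : 30 x L float tensor of one-hot node labels, where L is the
    #               number of distinct node labels found by build_dictionary().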
    def from_pytorch_to_tensorflow(self, batch_size):
        # @todo: unfinished; the batch sampled below is neither converted to
        # TensorFlow tensors nor returned yet.
        seed = random.randrange(sys.maxsize)
        random.seed(seed)
        tf_inputs = random.sample(self.pytorch_dataset[0],batch_size)
        random.seed(seed)
        tf_y = random.sample(self.pytorch_dataset[2],batch_size)
    def from_networkx_to_tensor(self, G, dict):
        A = nx.to_numpy_matrix(G)
        lab = [dict[G.nodes[v]['label'][0]] for v in nx.nodes(G)]
        return (torch.tensor(A).view(1,A.shape[0]*A.shape[1]),torch.tensor(lab))
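
    # Minimal usage sketch (the label dictionary below is hypothetical and mirrors
    # the commented example at the end of this file):
    #   dic = {'C': 0, 'N': 1, 'O': 2}
    #   A_flat, labels = fetcher.from_networkx_to_tensor(G, dic)
    #   # A_flat is the 1 x (n*n) flattened adjacency matrix, labels the per-node label indices.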
    def get_all_ds_infos(self, database):
        """Get information of all datasets from a database.

        Parameters
        ----------
        database : string
            Name of the database to query; currently only 'TUDataset' is
            implemented ('IAM' is a placeholder).

        Returns
        -------
        infos : dict
            Information of each dataset in the database, keyed by dataset name.
        """
        if database.lower() == 'tudataset':
            infos = self.get_all_tud_ds_infos()
        elif database.lower() == 'iam':
            pass  # @todo: not implemented yet.
        else:
            msg = 'Invalid Database name "' + database + '"'
            msg += '\n Available databases are as follows: \n\n'
            msg += '\n'.join(db for db in sorted(DATABASES))
            raise ValueError(msg)

        return infos
    def get_all_tud_ds_infos(self):
        """Get information of all datasets from the database TUDataset.

        Returns
        -------
        infos : dict
            Information of each TUDataset dataset, keyed by dataset name.
        """
        try:
            response = urllib.request.urlopen(DATABASES['tudataset'])
        except urllib.error.HTTPError:
            print('The URL of the database "TUDataset" is not available:\n' + DATABASES['tudataset'])
            raise

        infos = {}

        # Get tables.
        h_str = response.read()
        tree = etree.HTML(h_str)
        tables = tree.xpath('//table')
        for table in tables:
            # Get the domain of the datasets.
            h2_nodes = table.getprevious()
            if h2_nodes is not None and h2_nodes.tag == 'h2':
                domain = h2_nodes.text.strip().lower()
            else:
                domain = ''

            # Get each line in the table.
            tr_nodes = table.xpath('tbody/tr')
            for tr in tr_nodes[1:]:
                # Get each element in the line.
                td_node = tr.xpath('td')

                # task type.
                cls_txt = td_node[3].text.strip()
                if not cls_txt.startswith('R'):
                    class_number = int(cls_txt)
                    task_type = 'classification'
                else:
                    class_number = None
                    task_type = 'regression'

                # node attrs.
                na_text = td_node[8].text.strip()
                if not na_text.startswith('+'):
                    node_attr_dim = 0
                else:
                    node_attr_dim = int(re.findall(r'\((.*)\)', na_text)[0])

                # edge attrs.
                ea_text = td_node[10].text.strip()
                if ea_text == 'temporal':
                    edge_attr_dim = ea_text
                elif not ea_text.startswith('+'):
                    edge_attr_dim = 0
                else:
                    edge_attr_dim = int(re.findall(r'\((.*)\)', ea_text)[0])

                # geometry.
                geo_txt = td_node[9].text.strip()
                if geo_txt == '–':
                    geometry = None
                else:
                    geometry = geo_txt

                infos[td_node[0].xpath('strong')[0].text.strip()] = {
                    'database': 'tudataset',
                    'reference': td_node[1].text.strip(),
                    'dataset_size': int(td_node[2].text.strip()),
                    'class_number': class_number,
                    'task_type': task_type,
                    'ave_node_num': float(td_node[4].text.strip()),
                    'ave_edge_num': float(td_node[5].text.strip()),
                    'node_labeled': True if td_node[6].text.strip() == '+' else False,
                    'edge_labeled': True if td_node[7].text.strip() == '+' else False,
                    'node_attr_dim': node_attr_dim,
                    'geometry': geometry,
                    'edge_attr_dim': edge_attr_dim,
                    'url': td_node[11].xpath('a')[0].attrib['href'].strip(),
                    'domain': domain
                }

        return infos
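
    # Sketch of one entry of the dictionary returned above (field names follow the
    # assignments in the loop; the values depend on the current TUDataset web page):
    #   infos['MUTAG'] = {'database': 'tudataset', 'reference': ..., 'dataset_size': ...,
    #                     'class_number': ..., 'task_type': 'classification', ...}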
    def pretty_ds_infos(self, infos):
        """Get a string that pretty-prints the information of datasets.

        Parameters
        ----------
        infos : dict
            The datasets' information.

        Returns
        -------
        p_str : string
            The pretty print of the datasets' information.
        """
        p_str = '{\n'
        for key, val in infos.items():
            p_str += '\t\'' + str(key) + '\': {\n'
            for k, v in val.items():
                p_str += '\t\t\'' + str(k) + '\': '
                if isinstance(v, str):
                    p_str += '\'' + str(v) + '\',\n'
                else:
                    p_str += str(v) + ',\n'
            p_str += '\t},\n'
        p_str += '}'

        return p_str
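
# Minimal usage sketch (illustrative, not part of the class API): query the TUDataset
# metadata table and pretty-print it. This needs network access and assumes the page
# layout expected by get_all_tud_ds_infos().
#
#     fetcher = DataFetcher(name='Mao', root='data')   # the name is irrelevant for this query
#     tud_infos = fetcher.get_all_ds_infos('TUDataset')
#     print(fetcher.pretty_ds_infos(tud_infos))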
#dataset = self.open_files()
#print(build_dictionary(Gs))
#dic={'C':0,'N':1,'O':2}
#A,labels=from_networkx_to_tensor(Gs[13],dic)
#print(nx.to_numpy_matrix(Gs[13]),labels)
#print(A,labels)
#@todo : from_networkx_to_tensorflow
  418. # dataloader = DataLoader('Acyclic',root = "database",option = 'high',mode = "Pytorch")
  419. # dataloader.info()
  420. # inputs,adjs,y = dataloader.pytorch_dataset
  421. # """
  422. # test,train,valid = dataloader.dataset
  423. # Gs,y = test
  424. # Gs2,y2 = train
  425. # Gs3,y3 = valid
  426. # """
  427. # #Gs,y = dataloader.
  428. # #print(Gs,y)
  429. # """
  430. # Gs,y = dataloader.dataset
  431. # for G in Gs :
  432. # for e in G.edges():
  433. # print(G[e[0]])
  434. # """
  435. # #for e in Gs[13].edges():
  436. # # print(Gs[13][e[0]])
  437. # #print(from_networkx_to_tensor(Gs[7],{'C': [1., 0., 0.], 'N': [0., 1., 0.], 'O': [0., 0., 1.]}))
  438. # #dataset.open_files()
  439. # import os
  440. # import os.path as osp
  441. # import urllib
  442. # import tarfile
  443. # from zipfile import ZipFile
  444. # from gklearn.utils.graphfiles import loadDataset
  445. # import torch
  446. # import torch.nn.functional as F
  447. # import networkx as nx
  448. # import matplotlib.pyplot as plt
  449. # import numpy as np
  450. #
  451. # def DataLoader(name,root = 'data',mode = "Networkx",downloadAll = False,reload = False,letter = "High",number = 0,gender = "MM"):
  452. # dir_name = "_".join(name.split("-"))
  453. # if not osp.exists(root) :
  454. # os.makedirs(root)
  455. # url = "https://brunl01.users.greyc.fr/CHEMISTRY/"
  456. # urliam = "https://iapr-tc15.greyc.fr/IAM/"
  457. # list_database = {
  458. # "Ace" : (url,"ACEDataset.tar"),
  459. # "Acyclic" : (url,"Acyclic.tar.gz"),
  460. # "Aids" : (urliam,"AIDS.zip"),
  461. # "Alkane" : (url,"alkane_dataset.tar.gz"),
  462. # "Chiral" : (url,"DatasetAcyclicChiral.tar"),
  463. # "Coil_Del" : (urliam,"COIL-DEL.zip"),
  464. # "Coil_Rag" : (urliam,"COIL-RAG.zip"),
  465. # "Fingerprint" : (urliam,"Fingerprint.zip"),
  466. # "Grec" : (urliam,"GREC.zip"),
  467. # "Letter" : (urliam,"Letter.zip"),
  468. # "Mao" : (url,"mao.tgz"),
  469. # "Monoterpenoides" : (url,"monoterpenoides.tar.gz"),
  470. # "Mutagenicity" : (urliam,"Mutagenicity.zip"),
  471. # "Pah" : (url,"PAH.tar.gz"),
  472. # "Protein" : (urliam,"Protein.zip"),
  473. # "Ptc" : (url,"ptc.tgz"),
  474. # "Steroid" : (url,"SteroidDataset.tar"),
  475. # "Vitamin" : (url,"DatasetVitamin.tar"),
  476. # "Web" : (urliam,"Web.zip")
  477. # }
  478. #
  479. # data_to_use_in_datasets = {
  480. # "Acyclic" : ("Acyclic/dataset_bps.ds"),
  481. # "Aids" : ("AIDS_A.txt"),
  482. # "Alkane" : ("Alkane/dataset.ds","Alkane/dataset_boiling_point_names.txt"),
  483. # "Mao" : ("MAO/dataset.ds"),
  484. # "Monoterpenoides" : ("monoterpenoides/dataset_10+.ds"), #('monoterpenoides/dataset.ds'),('monoterpenoides/dataset_9.ds'),('monoterpenoides/trainset_9.ds')
  485. #
  486. # }
  487. # has_train_valid_test = {
  488. # "Coil_Del" : ('COIL-DEL/data/test.cxl','COIL-DEL/data/train.cxl','COIL-DEL/data/valid.cxl'),
  489. # "Coil_Rag" : ('COIL-RAG/data/test.cxl','COIL-RAG/data/train.cxl','COIL-RAG/data/valid.cxl'),
  490. # "Fingerprint" : ('Fingerprint/data/test.cxl','Fingerprint/data/train.cxl','Fingerprint/data/valid.cxl'),
  491. # "Grec" : ('GREC/data/test.cxl','GREC/data/train.cxl','GREC/data/valid.cxl'),
  492. # "Letter" : {'HIGH' : ('Letter/HIGH/test.cxl','Letter/HIGH/train.cxl','Letter/HIGH/validation.cxl'),
  493. # 'MED' : ('Letter/MED/test.cxl','Letter/MED/train.cxl','Letter/MED/validation.cxl'),
  494. # 'LOW' : ('Letter/LOW/test.cxl','Letter/LOW/train.cxl','Letter/LOW/validation.cxl')
  495. # },
  496. # "Mutagenicity" : ('Mutagenicity/data/test.cxl','Mutagenicity/data/train.cxl','Mutagenicity/data/validation.cxl'),
  497. # "Pah" : ['PAH/testset_0.ds','PAH/trainset_0.ds'],
  498. # "Protein" : ('Protein/data/test.cxl','Protein/data/train.cxl','Protein/data/valid.cxl'),
  499. # "Web" : ('Web/data/test.cxl','Web/data/train.cxl','Web/data/valid.cxl')
  500. # }
  501. #
  502. # if not name :
  503. # raise ValueError("No dataset entered")
  504. # if name not in list_database:
  505. # message = "Invalid Dataset name " + name
  506. # message += '\n Available datasets are as follows : \n\n'
  507. # message += '\n'.join(database for database in list_database)
  508. # raise ValueError(message)
  509. #
  510. # def download_file(url,filename):
  511. # try :
  512. # response = urllib.request.urlopen(url + filename)
  513. # except urllib.error.HTTPError:
  514. # print(filename + " not available or incorrect http link")
  515. # return
  516. # return response
  517. #
  518. # def write_archive_file(root,database):
  519. # path = osp.join(root,database)
  520. # url,filename = list_database[database]
  521. # filename_dir = osp.join(path,filename)
  522. # if not osp.exists(filename_dir) or reload:
  523. # response = download_file(url,filename)
  524. # if response is None :
  525. # return
  526. # if not osp.exists(path) :
  527. # os.makedirs(path)
  528. # with open(filename_dir,'wb') as outfile :
  529. # outfile.write(response.read())
  530. #
  531. # if downloadAll :
  532. # print('Waiting...')
  533. # for database in list_database :
  534. # write_archive_file(root,database)
  535. # print('Downloading finished')
  536. # else:
  537. # write_archive_file(root,name)
  538. #
  539. # def iter_load_dataset(data):
  540. # results = []
  541. # for datasets in data :
  542. # results.append(loadDataset(osp.join(root,name,datasets)))
  543. # return results
  544. #
  545. # def load_dataset(list_files):
  546. # if name == "Ptc":
  547. # if gender.upper() not in ['FR','FM','MM','MR']:
  548. # raise ValueError('gender chosen needs to be one of \n fr fm mm mr')
  549. # results = []
  550. # results.append(loadDataset(osp.join(root,name,'PTC/Test',gender.upper() + '.ds')))
  551. # results.append(loadDataset(osp.join(root,name,'PTC/Train',gender.upper() + '.ds')))
  552. # return results
  553. # if name == "Pah":
  554. # maximum_sets = 0
  555. # for file in list_files:
  556. # if file.endswith('ds'):
  557. # maximum_sets = max(maximum_sets,int(file.split('_')[1].split('.')[0]))
  558. # if number > maximum_sets :
  559. # raise ValueError("Please select a dataset with number less than " + str(maximum_sets + 1))
  560. # data = has_train_valid_test["Pah"]
  561. # data[0] = has_train_valid_test["Pah"][0].split('_')[0] + '_' + str(number) + '.ds'
  562. # data[1] = has_train_valid_test["Pah"][1].split('_')[0] + '_' + str(number) + '.ds'
  563. # #print(data)
  564. # return iter_load_dataset(data)
  565. # if name == "Letter":
  566. # if letter.upper() in has_train_valid_test["Letter"]:
  567. # data = has_train_valid_test["Letter"][letter.upper()]
  568. # else:
  569. # message = "The parameter for letter is incorrect choose between : "
  570. # message += "\nhigh med low"
  571. # raise ValueError(message)
  572. # results = []
  573. # for datasets in data:
  574. # results.append(loadDataset(osp.join(root,name,datasets)))
  575. # return results
  576. # if name in has_train_valid_test : #common IAM dataset with train, valid and test
  577. # data = has_train_valid_test[name]
  578. # results = []
  579. # for datasets in data :
  580. # results.append(loadDataset(osp.join(root,name,datasets)))
  581. # return results
  582. # else: #common dataset without train,valid and test, only dataset.ds file
  583. # data = data_to_use_in_datasets[name]
  584. # if len(data) > 1 and data[0] in list_files and data[1] in list_files:
  585. # return loadDataset(osp.join(root,name,data[0]),filename_y = osp.join(root,name,data[1]))
  586. # if data in list_files:
  587. # return loadDataset(osp.join(root,name,data))
  588. # def open_files():
  589. # filename = list_database[name][1]
  590. # path = osp.join(root,name)
  591. # filename_archive = osp.join(root,name,filename)
  592. #
  593. # if filename.endswith('gz'):
  594. # if tarfile.is_tarfile(filename_archive):
  595. # with tarfile.open(filename_archive,"r:gz") as tar:
  596. # if reload:
  597. # print(filename + " Downloaded")
  598. # tar.extractall(path = path)
  599. # return load_dataset(tar.getnames())
  600. # #raise ValueError("dataset not available")
  601. #
  602. #
  603. # elif filename.endswith('.tar'):
  604. # if tarfile.is_tarfile(filename_archive):
  605. # with tarfile.open(filename_archive,"r:") as tar:
  606. # if reload :
  607. # print(filename + " Downloaded")
  608. # tar.extractall(path = path)
  609. # return load_dataset(tar.getnames())
  610. # elif filename.endswith('.zip'):
  611. # with ZipFile(filename_archive,"r") as zip_ref:
  612. # if reload :
  613. # print(filename + " Downloaded")
  614. # zip_ref.extractall(path)
  615. # return load_dataset(zip_ref.namelist())
  616. # else:
  617. # print(filename + " Unsupported file")
  618. # """
  619. # with tarfile.open(osp.join(root,name,list_database[name][1]),"r:gz") as files:
  620. # for file in files.getnames():
  621. # print(file)
  622. # """
  623. #
  624. # def build_dictionary(Gs):
  625. # labels = set()
  626. # bond_type_number_maxi = int(max(max([[G[e[0]][e[1]]['bond_type'] for e in G.edges()] for G in Gs])))
  627. # print(bond_type_number_maxi)
  628. # sizes = set()
  629. # for G in Gs :
  630. # for _,node in G.nodes(data = True): # or for node in nx.nodes(G)
  631. # #print(node)
  632. # labels.add(node["label"][0]) # labels.add(G.nodes[node]["label"][0])
  633. # sizes.add(G.order())
  634. # if len(labels) >= bond_type_number_maxi:
  635. # break
  636. # label_dict = {}
  637. # for i,label in enumerate(labels):
  638. # label_dict[label] = [0.]*bond_type_number_maxi
  639. # label_dict[label][i] = 1.
  640. # return label_dict
  641. #
  642. # def from_networkx_to_pytorch(Gs):
  643. # #exemple : atom_to_onehot = {'C': [1., 0., 0.], 'N': [0., 1., 0.], 'O': [0., 0., 1.]}
  644. # # code from https://github.com/bgauzere/pygnn/blob/master/utils.py
  645. # atom_to_onehot = build_dictionary(Gs)
  646. # max_size = 30
  647. # adjs = []
  648. # inputs = []
  649. # for i, G in enumerate(Gs):
  650. # I = torch.eye(G.order(), G.order())
  651. # A = torch.Tensor(nx.adjacency_matrix(G).todense())
  652. # A = torch.tensor(nx.to_scipy_sparse_matrix(G,dtype = int,weight = 'bond_type').todense(),dtype = torch.int)
  653. # adj = F.pad(A+I, pad=(0, max_size-G.order(), 0, max_size-G.order())) #add I now ?
  654. # adjs.append(adj)
  655. # f_0 = []
  656. # for _, label in G.nodes(data=True):
  657. # #print(_,label)
  658. # cur_label = atom_to_onehot[label['label'][0]].copy()
  659. # f_0.append(cur_label)
  660. # X = F.pad(torch.Tensor(f_0), pad=(0, 0, 0, max_size-G.order()))
  661. # inputs.append(X)
  662. # return inputs,adjs,y
  663. #
  664. # def from_networkx_to_tensor(G,dict):
  665. # A=nx.to_numpy_matrix(G)
  666. # lab=[dict[G.nodes[v]['label'][0]] for v in nx.nodes(G)]
  667. # return (torch.tensor(A).view(1,A.shape[0]*A.shape[1]),torch.tensor(lab))
  668. #
  669. # dataset= open_files()
  670. # #print(build_dictionary(Gs))
  671. # #dic={'C':0,'N':1,'O':2}
  672. # #A,labels=from_networkx_to_tensor(Gs[13],dic)
  673. # #print(nx.to_numpy_matrix(Gs[13]),labels)
  674. # #print(A,labels)
  675. #
  676. # """
  677. # for G in Gs :
  678. # for node in nx.nodes(G):
  679. # print(G.nodes[node])
  680. # """
  681. # if mode == "pytorch":
  682. # Gs,y = dataset
  683. # inputs,adjs,y = from_networkx_to_pytorch(Gs)
  684. # print(inputs,adjs)
  685. # return inputs,adjs,y
  686. #
  687. #
  688. # """
  689. # dic = dict()
  690. # for i,l in enumerate(label):
  691. # dic[l] = i
  692. # dic = {'C': 0, 'N': 1, 'O': 2}
  693. # A,labels=from_networkx_to_tensor(Gs[0],dic)
  694. # #print(A,labels)
  695. # return A,labels
  696. # """
  697. #
  698. # return dataset
  699. #
  700. # #open_files()
  701. #
  702. # def label_to_color(label):
  703. # if label == 'C':
  704. # return 0.1
  705. # elif label == 'O':
  706. # return 0.8
  707. #
  708. # def nodes_to_color_sequence(G):
  709. # return [label_to_color(c[1]['label'][0]) for c in G.nodes(data=True)]
  710. # ##############
  711. # """
  712. # dataset = DataLoader('Mao',root = "database")
  713. # print(dataset)
  714. # Gs,y = dataset
  715. # """
  716. # """
  717. # dataset = DataLoader('Alkane',root = "database") # Gs is empty here whereas y isn't -> not working
  718. # Gs,y = dataset
  719. # """
  720. # """
  721. # dataset = DataLoader('Acyclic', root = "database")
  722. # Gs,y = dataset
  723. # """
  724. # """
  725. # dataset = DataLoader('Monoterpenoides', root = "database")
  726. # Gs,y = dataset
  727. # """
  728. # """
  729. # dataset = DataLoader('Pah',root = 'database', number = 8)
  730. # test_set,train_set = dataset
  731. # Gs,y = test_set
  732. # Gs2,y2 = train_set
  733. # """
  734. # """
  735. # dataset = DataLoader('Coil_Del',root = "database")
  736. # test,train,valid = dataset
  737. # Gs,y = test
  738. # Gs2,y2 = train
  739. # Gs3, y3 = valid
  740. # """
  741. # """
  742. # dataset = DataLoader('Coil_Rag',root = "database")
  743. # test,train,valid = dataset
  744. # Gs,y = test
  745. # Gs2,y2 = train
  746. # Gs3, y3 = valid
  747. # """
  748. # """
  749. # dataset = DataLoader('Fingerprint',root = "database")
  750. # test,train,valid = dataset
  751. # Gs,y = test
  752. # Gs2,y2 = train
  753. # Gs3, y3 = valid
  754. # """
  755. # """
  756. # dataset = DataLoader('Grec',root = "database")
  757. # test,train,valid = dataset
  758. # Gs,y = test
  759. # Gs2,y2 = train
  760. # Gs3, y3 = valid
  761. # """
  762. # """
  763. # dataset = DataLoader('Letter',root = "database",letter = 'low') #high low med
  764. # test,train,valid = dataset
  765. # Gs,y = test
  766. # Gs2,y2 = train
  767. # Gs3, y3 = valid
  768. # """
  769. # """
  770. # dataset = DataLoader('Mutagenicity',root = "database")
  771. # test,train,valid = dataset
  772. # Gs,y = test
  773. # Gs2,y2 = train
  774. # Gs3, y3 = valid
  775. # """
  776. # """
  777. # dataset = DataLoader('Protein',root = "database")
  778. # test,train,valid = dataset
  779. # Gs,y = test
  780. # Gs2,y2 = train
  781. # Gs3, y3 = valid
  782. # """
  783. # """
  784. # dataset = DataLoader('Ptc', root = "database",gender = 'fm') # not working, Gs and y are empty perhaps issue coming from loadDataset
  785. # valid,train = dataset
  786. # Gs,y = valid
  787. # Gs2,y2 = train
  788. # """
  789. # """
  790. # dataset = DataLoader('Web', root = "database")
  791. # test,train,valid = dataset
  792. # Gs,y = test
  793. # Gs2,y2 = train
  794. # Gs3,y3 = valid
  795. # """
  796. # print(Gs,y)
  797. # print(len(dataset))
  798. # ##############
  799. # #print('edge max label',max(max([[G[e[0]][e[1]]['bond_type'] for e in G.edges()] for G in Gs])))
  800. # G1 = Gs[13]
  801. # G2 = Gs[23]
  802. # """
  803. # nx.draw_networkx(G1,with_labels=True,node_color = nodes_to_color_sequence(G1),cmap='autumn')
  804. # plt.figure()
  805. # nx.draw_networkx(G2,with_labels=True,node_color = nodes_to_color_sequence(G2),cmap='autumn')
  806. # """
  807. # from pathlib import Path
  808. # DATA_PATH = Path("data")
  809. # def import_datasets():
  810. #
  811. # import urllib
  812. # import tarfile
  813. # from zipfile import ZipFile
  814. # URL = "https://brunl01.users.greyc.fr/CHEMISTRY/"
  815. # URLIAM = "https://iapr-tc15.greyc.fr/IAM/"
  816. #
  817. # LIST_DATABASE = {
  818. # "Pah" : (URL,"PAH.tar.gz"),
  819. # "Mao" : (URL,"mao.tgz"),
  820. # "Ptc" : (URL,"ptc.tgz"),
  821. # "Aids" : (URLIAM,"AIDS.zip"),
  822. # "Acyclic" : (URL,"Acyclic.tar.gz"),
  823. # "Alkane" : (URL,"alkane_dataset.tar.gz"),
  824. # "Chiral" : (URL,"DatasetAcyclicChiral.tar"),
  825. # "Vitamin" : (URL,"DatasetVitamin.tar"),
  826. # "Ace" : (URL,"ACEDataset.tar"),
  827. # "Steroid" : (URL,"SteroidDataset.tar"),
  828. # "Monoterpenoides" : (URL,"monoterpenoides.tar.gz"),
  829. # "Letter" : (URLIAM,"Letter.zip"),
  830. # "Grec" : (URLIAM,"GREC.zip"),
  831. # "Fingerprint" : (URLIAM,"Fingerprint.zip"),
  832. # "Coil_Rag" : (URLIAM,"COIL-RAG.zip"),
  833. # "Coil_Del" : (URLIAM,"COIL-DEL.zip"),
  834. # "Web" : (URLIAM,"Web.zip"),
  835. # "Mutagenicity" : (URLIAM,"Mutagenicity.zip"),
  836. # "Protein" : (URLIAM,"Protein.zip")
  837. # }
  838. # print("Select databases in the list. Select multiple, split by white spaces .\nWrite All to select all of them.\n")
  839. # print(', '.join(database for database in LIST_DATABASE))
  840. # print("Choice : ",end = ' ')
  841. # selected_databases = input().split()
  842. #
  843. # def download_file(url,filename):
  844. # try :
  845. # response = urllib.request.urlopen(url + filename)
  846. # except urllib.error.HTTPError:
  847. # print(filename + " not available or incorrect http link")
  848. # return
  849. # return response
  850. #
  851. # def write_archive_file(database):
  852. #
  853. # PATH = DATA_PATH / database
  854. # url,filename = LIST_DATABASE[database]
  855. # if not (PATH / filename).exists():
  856. # response = download_file(url,filename)
  857. # if response is None :
  858. # return
  859. # if not PATH.exists() :
  860. # PATH.mkdir(parents=True, exist_ok=True)
  861. # with open(PATH/filename,'wb') as outfile :
  862. # outfile.write(response.read())
  863. #
  864. # if filename[-2:] == 'gz':
  865. # if tarfile.is_tarfile(PATH/filename):
  866. # with tarfile.open(PATH/filename,"r:gz") as tar:
  867. # tar.extractall(path = PATH)
  868. # print(filename + ' Downloaded')
  869. # elif filename[-3:] == 'tar':
  870. # if tarfile.is_tarfile(PATH/filename):
  871. # with tarfile.open(PATH/filename,"r:") as tar:
  872. # tar.extractall(path = PATH)
  873. # print(filename + ' Downloaded')
  874. # elif filename[-3:] == 'zip':
  875. # with ZipFile(PATH/filename,"r") as zip_ref:
  876. # zip_ref.extractall(PATH)
  877. # print(filename + ' Downloaded')
  878. # else:
  879. # print("Unsupported file")
  880. # if 'All' in selected_databases:
  881. # print('Waiting...')
  882. # for database in LIST_DATABASE :
  883. # write_archive_file(database)
  884. # print('Finished')
  885. # else:
  886. # print('Waiting...')
  887. # for database in selected_databases :
  888. # if database in LIST_DATABASE :
  889. # write_archive_file(database)
  890. # print('Finished')
  891. # import_datasets()
  892. # class GraphFetcher(object):
  893. #
  894. #
  895. # def __init__(self, filename=None, filename_targets=None, **kwargs):
  896. # if filename is None:
  897. # self._graphs = None
  898. # self._targets = None
  899. # self._node_labels = None
  900. # self._edge_labels = None
  901. # self._node_attrs = None
  902. # self._edge_attrs = None
  903. # else:
  904. # self.load_dataset(filename, filename_targets=filename_targets, **kwargs)
  905. #
  906. # self._substructures = None
  907. # self._node_label_dim = None
  908. # self._edge_label_dim = None
  909. # self._directed = None
  910. # self._dataset_size = None
  911. # self._total_node_num = None
  912. # self._ave_node_num = None
  913. # self._min_node_num = None
  914. # self._max_node_num = None
  915. # self._total_edge_num = None
  916. # self._ave_edge_num = None
  917. # self._min_edge_num = None
  918. # self._max_edge_num = None
  919. # self._ave_node_degree = None
  920. # self._min_node_degree = None
  921. # self._max_node_degree = None
  922. # self._ave_fill_factor = None
  923. # self._min_fill_factor = None
  924. # self._max_fill_factor = None
  925. # self._node_label_nums = None
  926. # self._edge_label_nums = None
  927. # self._node_attr_dim = None
  928. # self._edge_attr_dim = None
  929. # self._class_number = None
  930. #
  931. #
  932. # def load_dataset(self, filename, filename_targets=None, **kwargs):
  933. # self._graphs, self._targets, label_names = load_dataset(filename, filename_targets=filename_targets, **kwargs)
  934. # self._node_labels = label_names['node_labels']
  935. # self._node_attrs = label_names['node_attrs']
  936. # self._edge_labels = label_names['edge_labels']
  937. # self._edge_attrs = label_names['edge_attrs']
  938. # self.clean_labels()
  939. #
  940. #
  941. # def load_graphs(self, graphs, targets=None):
  942. # # this has to be followed by set_labels().
  943. # self._graphs = graphs
  944. # self._targets = targets
  945. # # self.set_labels_attrs() # @todo
  946. #
  947. #
  948. # def load_predefined_dataset(self, ds_name):
  949. # current_path = os.path.dirname(os.path.realpath(__file__)) + '/'
  950. # if ds_name == 'Acyclic':
  951. # ds_file = current_path + '../../datasets/Acyclic/dataset_bps.ds'
  952. # self._graphs, self._targets, label_names = load_dataset(ds_file)
  953. # elif ds_name == 'AIDS':
  954. # ds_file = current_path + '../../datasets/AIDS/AIDS_A.txt'
  955. # self._graphs, self._targets, label_names = load_dataset(ds_file)
  956. # elif ds_name == 'Alkane':
  957. # ds_file = current_path + '../../datasets/Alkane/dataset.ds'
  958. # fn_targets = current_path + '../../datasets/Alkane/dataset_boiling_point_names.txt'
  959. # self._graphs, self._targets, label_names = load_dataset(ds_file, filename_targets=fn_targets)
  960. # elif ds_name == 'COIL-DEL':
  961. # ds_file = current_path + '../../datasets/COIL-DEL/COIL-DEL_A.txt'
  962. # self._graphs, self._targets, label_names = load_dataset(ds_file)
  963. # elif ds_name == 'COIL-RAG':
  964. # ds_file = current_path + '../../datasets/COIL-RAG/COIL-RAG_A.txt'
  965. # self._graphs, self._targets, label_names = load_dataset(ds_file)
  966. # elif ds_name == 'COLORS-3':
  967. # ds_file = current_path + '../../datasets/COLORS-3/COLORS-3_A.txt'
  968. # self._graphs, self._targets, label_names = load_dataset(ds_file)
  969. # elif ds_name == 'Cuneiform':
  970. # ds_file = current_path + '../../datasets/Cuneiform/Cuneiform_A.txt'
  971. # self._graphs, self._targets, label_names = load_dataset(ds_file)
  972. # elif ds_name == 'DD':
  973. # ds_file = current_path + '../../datasets/DD/DD_A.txt'
  974. # self._graphs, self._targets, label_names = load_dataset(ds_file)
  975. # elif ds_name == 'ENZYMES':
  976. # ds_file = current_path + '../../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'
  977. # self._graphs, self._targets, label_names = load_dataset(ds_file)
  978. # elif ds_name == 'Fingerprint':
  979. # ds_file = current_path + '../../datasets/Fingerprint/Fingerprint_A.txt'
  980. # self._graphs, self._targets, label_names = load_dataset(ds_file)
  981. # elif ds_name == 'FRANKENSTEIN':
  982. # ds_file = current_path + '../../datasets/FRANKENSTEIN/FRANKENSTEIN_A.txt'
  983. # self._graphs, self._targets, label_names = load_dataset(ds_file)
  984. # elif ds_name == 'Letter-high': # node non-symb
  985. # ds_file = current_path + '../../datasets/Letter-high/Letter-high_A.txt'
  986. # self._graphs, self._targets, label_names = load_dataset(ds_file)
  987. # elif ds_name == 'Letter-low': # node non-symb
  988. # ds_file = current_path + '../../datasets/Letter-low/Letter-low_A.txt'
  989. # self._graphs, self._targets, label_names = load_dataset(ds_file)
  990. # elif ds_name == 'Letter-med': # node non-symb
  991. # ds_file = current_path + '../../datasets/Letter-med/Letter-med_A.txt'
  992. # self._graphs, self._targets, label_names = load_dataset(ds_file)
  993. # elif ds_name == 'MAO':
  994. # ds_file = current_path + '../../datasets/MAO/dataset.ds'
  995. # self._graphs, self._targets, label_names = load_dataset(ds_file)
  996. # elif ds_name == 'Monoterpenoides':
  997. # ds_file = current_path + '../../datasets/Monoterpenoides/dataset_10+.ds'
  998. # self._graphs, self._targets, label_names = load_dataset(ds_file)
  999. # elif ds_name == 'MUTAG':
  1000. # ds_file = current_path + '../../datasets/MUTAG/MUTAG_A.txt'
  1001. # self._graphs, self._targets, label_names = load_dataset(ds_file)
  1002. # elif ds_name == 'NCI1':
  1003. # ds_file = current_path + '../../datasets/NCI1/NCI1_A.txt'
  1004. # self._graphs, self._targets, label_names = load_dataset(ds_file)
  1005. # elif ds_name == 'NCI109':
  1006. # ds_file = current_path + '../../datasets/NCI109/NCI109_A.txt'
  1007. # self._graphs, self._targets, label_names = load_dataset(ds_file)
  1008. # elif ds_name == 'PAH':
  1009. # ds_file = current_path + '../../datasets/PAH/dataset.ds'
  1010. # self._graphs, self._targets, label_names = load_dataset(ds_file)
  1011. # elif ds_name == 'SYNTHETIC':
  1012. # pass
  1013. # elif ds_name == 'SYNTHETICnew':
  1014. # ds_file = current_path + '../../datasets/SYNTHETICnew/SYNTHETICnew_A.txt'
  1015. # self._graphs, self._targets, label_names = load_dataset(ds_file)
  1016. # elif ds_name == 'Synthie':
  1017. # pass
  1018. # else:
  1019. # raise Exception('The dataset name "', ds_name, '" is not pre-defined.')
  1020. #
  1021. # self._node_labels = label_names['node_labels']
  1022. # self._node_attrs = label_names['node_attrs']
  1023. # self._edge_labels = label_names['edge_labels']
  1024. # self._edge_attrs = label_names['edge_attrs']
  1025. # self.clean_labels()
  1026. #
  1027. # def set_labels(self, node_labels=[], node_attrs=[], edge_labels=[], edge_attrs=[]):
  1028. # self._node_labels = node_labels
  1029. # self._node_attrs = node_attrs
  1030. # self._edge_labels = edge_labels
  1031. # self._edge_attrs = edge_attrs
  1032. #
  1033. # def set_labels_attrs(self, node_labels=None, node_attrs=None, edge_labels=None, edge_attrs=None):
  1034. # # @todo: remove labels which have only one possible values.
  1035. # if node_labels is None:
  1036. # self._node_labels = self._graphs[0].graph['node_labels']
  1037. # # # graphs are considered node unlabeled if all nodes have the same label.
  1038. # # infos.update({'node_labeled': is_nl if node_label_num > 1 else False})
  1039. # if node_attrs is None:
  1040. # self._node_attrs = self._graphs[0].graph['node_attrs']
  1041. # # for G in Gn:
  1042. # # for n in G.nodes(data=True):
  1043. # # if 'attributes' in n[1]:
  1044. # # return len(n[1]['attributes'])
  1045. # # return 0
  1046. # if edge_labels is None:
  1047. # self._edge_labels = self._graphs[0].graph['edge_labels']
  1048. # # # graphs are considered edge unlabeled if all edges have the same label.
  1049. # # infos.update({'edge_labeled': is_el if edge_label_num > 1 else False})
  1050. # if edge_attrs is None:
  1051. # self._edge_attrs = self._graphs[0].graph['edge_attrs']
  1052. # # for G in Gn:
  1053. # # if nx.number_of_edges(G) > 0:
  1054. # # for e in G.edges(data=True):
  1055. # # if 'attributes' in e[2]:
  1056. # # return len(e[2]['attributes'])
  1057. # # return 0
  1058. #
  1059. #
  1060. # def get_dataset_infos(self, keys=None, params=None):
  1061. # """Computes and returns the structure and property information of the graph dataset.
  1062. #
  1063. # Parameters
  1064. # ----------
  1065. # keys : list, optional
  1066. # A list of strings which indicate which informations will be returned. The
  1067. # possible choices includes:
  1068. #
  1069. # 'substructures': sub-structures graphs contains, including 'linear', 'non
  1070. # linear' and 'cyclic'.
  1071. #
  1072. # 'node_label_dim': whether vertices have symbolic labels.
  1073. #
  1074. # 'edge_label_dim': whether egdes have symbolic labels.
  1075. #
  1076. # 'directed': whether graphs in dataset are directed.
  1077. #
  1078. # 'dataset_size': number of graphs in dataset.
  1079. #
  1080. # 'total_node_num': total number of vertices of all graphs in dataset.
  1081. #
  1082. # 'ave_node_num': average number of vertices of graphs in dataset.
  1083. #
  1084. # 'min_node_num': minimum number of vertices of graphs in dataset.
  1085. #
  1086. # 'max_node_num': maximum number of vertices of graphs in dataset.
  1087. #
  1088. # 'total_edge_num': total number of edges of all graphs in dataset.
  1089. #
  1090. # 'ave_edge_num': average number of edges of graphs in dataset.
  1091. #
  1092. # 'min_edge_num': minimum number of edges of graphs in dataset.
  1093. #
  1094. # 'max_edge_num': maximum number of edges of graphs in dataset.
  1095. #
  1096. # 'ave_node_degree': average vertex degree of graphs in dataset.
  1097. #
  1098. # 'min_node_degree': minimum vertex degree of graphs in dataset.
  1099. #
  1100. # 'max_node_degree': maximum vertex degree of graphs in dataset.
  1101. #
  1102. # 'ave_fill_factor': average fill factor (number_of_edges /
  1103. # (number_of_nodes ** 2)) of graphs in dataset.
  1104. #
  1105. # 'min_fill_factor': minimum fill factor of graphs in dataset.
  1106. #
  1107. # 'max_fill_factor': maximum fill factor of graphs in dataset.
  1108. #
  1109. # 'node_label_nums': list of numbers of symbolic vertex labels of graphs in dataset.
  1110. #
  1111. # 'edge_label_nums': list number of symbolic edge labels of graphs in dataset.
  1112. #
  1113. # 'node_attr_dim': number of dimensions of non-symbolic vertex labels.
  1114. # Extracted from the 'attributes' attribute of graph nodes.
  1115. #
  1116. # 'edge_attr_dim': number of dimensions of non-symbolic edge labels.
  1117. # Extracted from the 'attributes' attribute of graph edges.
  1118. #
  1119. # 'class_number': number of classes. Only available for classification problems.
  1120. #
  1121. # 'all_degree_entropy': the entropy of degree distribution of each graph.
  1122. #
  1123. # 'ave_degree_entropy': the average entropy of degree distribution of all graphs.
  1124. #
  1125. # All informations above will be returned if `keys` is not given.
  1126. #
  1127. # params: dict of dict, optional
  1128. # A dictinary which contains extra parameters for each possible
  1129. # element in ``keys``.
  1130. #
  1131. # Return
  1132. # ------
  1133. # dict
  1134. # Information of the graph dataset keyed by `keys`.
  1135. # """
  1136. # infos = {}
  1137. #
  1138. # if keys == None:
  1139. # keys = [
  1140. # 'substructures',
  1141. # 'node_label_dim',
  1142. # 'edge_label_dim',
  1143. # 'directed',
  1144. # 'dataset_size',
  1145. # 'total_node_num',
  1146. # 'ave_node_num',
  1147. # 'min_node_num',
  1148. # 'max_node_num',
  1149. # 'total_edge_num',
  1150. # 'ave_edge_num',
  1151. # 'min_edge_num',
  1152. # 'max_edge_num',
  1153. # 'ave_node_degree',
  1154. # 'min_node_degree',
  1155. # 'max_node_degree',
  1156. # 'ave_fill_factor',
  1157. # 'min_fill_factor',
  1158. # 'max_fill_factor',
  1159. # 'node_label_nums',
  1160. # 'edge_label_nums',
  1161. # 'node_attr_dim',
  1162. # 'edge_attr_dim',
  1163. # 'class_number',
  1164. # 'all_degree_entropy',
  1165. # 'ave_degree_entropy'
  1166. # ]
  1167. #
  1168. # # dataset size
  1169. # if 'dataset_size' in keys:
  1170. # if self._dataset_size is None:
  1171. # self._dataset_size = self._get_dataset_size()
  1172. # infos['dataset_size'] = self._dataset_size
  1173. #
  1174. # # graph node number
  1175. # if any(i in keys for i in ['total_node_num', 'ave_node_num', 'min_node_num', 'max_node_num']):
  1176. # all_node_nums = self._get_all_node_nums()
  1177. # if 'total_node_num' in keys:
  1178. # if self._total_node_num is None:
  1179. # self._total_node_num = self._get_total_node_num(all_node_nums)
  1180. # infos['total_node_num'] = self._total_node_num
  1181. #
  1182. # if 'ave_node_num' in keys:
  1183. # if self._ave_node_num is None:
  1184. # self._ave_node_num = self._get_ave_node_num(all_node_nums)
  1185. # infos['ave_node_num'] = self._ave_node_num
  1186. #
  1187. # if 'min_node_num' in keys:
  1188. # if self._min_node_num is None:
  1189. # self._min_node_num = self._get_min_node_num(all_node_nums)
  1190. # infos['min_node_num'] = self._min_node_num
  1191. #
  1192. # if 'max_node_num' in keys:
  1193. # if self._max_node_num is None:
  1194. # self._max_node_num = self._get_max_node_num(all_node_nums)
  1195. # infos['max_node_num'] = self._max_node_num
  1196. #
  1197. # # graph edge number
  1198. # if any(i in keys for i in ['total_edge_num', 'ave_edge_num', 'min_edge_num', 'max_edge_num']):
  1199. # all_edge_nums = self._get_all_edge_nums()
  1200. # if 'total_edge_num' in keys:
  1201. # if self._total_edge_num is None:
  1202. # self._total_edge_num = self._get_total_edge_num(all_edge_nums)
  1203. # infos['total_edge_num'] = self._total_edge_num
  1204. #
  1205. # if 'ave_edge_num' in keys:
  1206. # if self._ave_edge_num is None:
  1207. # self._ave_edge_num = self._get_ave_edge_num(all_edge_nums)
  1208. # infos['ave_edge_num'] = self._ave_edge_num
  1209. #
  1210. # if 'max_edge_num' in keys:
  1211. # if self._max_edge_num is None:
  1212. # self._max_edge_num = self._get_max_edge_num(all_edge_nums)
  1213. # infos['max_edge_num'] = self._max_edge_num
  1214. # if 'min_edge_num' in keys:
  1215. # if self._min_edge_num is None:
  1216. # self._min_edge_num = self._get_min_edge_num(all_edge_nums)
  1217. # infos['min_edge_num'] = self._min_edge_num
  1218. #
  1219. # # label number
  1220. # if 'node_label_dim' in keys:
  1221. # if self._node_label_dim is None:
  1222. # self._node_label_dim = self._get_node_label_dim()
  1223. # infos['node_label_dim'] = self._node_label_dim
  1224. #
  1225. # if 'node_label_nums' in keys:
  1226. # if self._node_label_nums is None:
  1227. # self._node_label_nums = {}
  1228. # for node_label in self._node_labels:
  1229. # self._node_label_nums[node_label] = self._get_node_label_num(node_label)
  1230. # infos['node_label_nums'] = self._node_label_nums
  1231. #
  1232. # if 'edge_label_dim' in keys:
  1233. # if self._edge_label_dim is None:
  1234. # self._edge_label_dim = self._get_edge_label_dim()
  1235. # infos['edge_label_dim'] = self._edge_label_dim
  1236. #
  1237. # if 'edge_label_nums' in keys:
  1238. # if self._edge_label_nums is None:
  1239. # self._edge_label_nums = {}
  1240. # for edge_label in self._edge_labels:
  1241. # self._edge_label_nums[edge_label] = self._get_edge_label_num(edge_label)
  1242. # infos['edge_label_nums'] = self._edge_label_nums
  1243. #
  1244. # if 'directed' in keys or 'substructures' in keys:
  1245. # if self._directed is None:
  1246. # self._directed = self._is_directed()
  1247. # infos['directed'] = self._directed
  1248. #
  1249. # # node degree
  1250. # if any(i in keys for i in ['ave_node_degree', 'max_node_degree', 'min_node_degree']):
  1251. # all_node_degrees = self._get_all_node_degrees()
  1252. #
  1253. # if 'ave_node_degree' in keys:
  1254. # if self._ave_node_degree is None:
  1255. # self._ave_node_degree = self._get_ave_node_degree(all_node_degrees)
  1256. # infos['ave_node_degree'] = self._ave_node_degree
  1257. #
  1258. # if 'max_node_degree' in keys:
  1259. # if self._max_node_degree is None:
  1260. # self._max_node_degree = self._get_max_node_degree(all_node_degrees)
  1261. # infos['max_node_degree'] = self._max_node_degree
  1262. #
  1263. # if 'min_node_degree' in keys:
  1264. # if self._min_node_degree is None:
  1265. # self._min_node_degree = self._get_min_node_degree(all_node_degrees)
  1266. # infos['min_node_degree'] = self._min_node_degree
  1267. #
  1268. # # fill factor
  1269. # if any(i in keys for i in ['ave_fill_factor', 'max_fill_factor', 'min_fill_factor']):
  1270. # all_fill_factors = self._get_all_fill_factors()
  1271. #
  1272. # if 'ave_fill_factor' in keys:
  1273. # if self._ave_fill_factor is None:
  1274. # self._ave_fill_factor = self._get_ave_fill_factor(all_fill_factors)
  1275. # infos['ave_fill_factor'] = self._ave_fill_factor
  1276. #
  1277. # if 'max_fill_factor' in keys:
  1278. # if self._max_fill_factor is None:
  1279. # self._max_fill_factor = self._get_max_fill_factor(all_fill_factors)
  1280. # infos['max_fill_factor'] = self._max_fill_factor
  1281. #
  1282. # if 'min_fill_factor' in keys:
  1283. # if self._min_fill_factor is None:
  1284. # self._min_fill_factor = self._get_min_fill_factor(all_fill_factors)
  1285. # infos['min_fill_factor'] = self._min_fill_factor
  1286. #
  1287. # if 'substructures' in keys:
  1288. # if self._substructures is None:
  1289. # self._substructures = self._get_substructures()
  1290. # infos['substructures'] = self._substructures
  1291. #
  1292. # if 'class_number' in keys:
  1293. # if self._class_number is None:
  1294. # self._class_number = self._get_class_number()
  1295. # infos['class_number'] = self._class_number
  1296. #
  1297. # if 'node_attr_dim' in keys:
  1298. # if self._node_attr_dim is None:
  1299. # self._node_attr_dim = self._get_node_attr_dim()
  1300. # infos['node_attr_dim'] = self._node_attr_dim
  1301. #
  1302. # if 'edge_attr_dim' in keys:
  1303. # if self._edge_attr_dim is None:
  1304. # self._edge_attr_dim = self._get_edge_attr_dim()
  1305. # infos['edge_attr_dim'] = self._edge_attr_dim
  1306. #
  1307. # # entropy of degree distribution.
  1308. #
  1309. # if 'all_degree_entropy' in keys:
  1310. # if params is not None and ('all_degree_entropy' in params) and ('base' in params['all_degree_entropy']):
  1311. # base = params['all_degree_entropy']['base']
  1312. # else:
  1313. # base = None
  1314. # infos['all_degree_entropy'] = self._compute_all_degree_entropy(base=base)
  1315. #
  1316. # if 'ave_degree_entropy' in keys:
  1317. # if params is not None and ('ave_degree_entropy' in params) and ('base' in params['ave_degree_entropy']):
  1318. # base = params['ave_degree_entropy']['base']
  1319. # else:
  1320. # base = None
  1321. # infos['ave_degree_entropy'] = np.mean(self._compute_all_degree_entropy(base=base))
  1322. #
  1323. # return infos
  1324. #
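# Illustrative usage sketch (editorial example, not part of the original file;
# it assumes a `Dataset` instance `ds` already populated via `load_graphs`):
# only the requested keys are computed, and `params` forwards per-key options
# such as the logarithm base used for the entropy statistics.
#
# ds = Dataset()
# ds.load_graphs(graphs, targets)  # graphs: list of networkx graphs
# infos = ds.get_dataset_infos(
#     keys=['dataset_size', 'ave_node_num', 'ave_degree_entropy'],
#     params={'ave_degree_entropy': {'base': 2}})  # entropy in bits
# print(infos)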
  1325. #
  1326. # def print_graph_infos(self, infos):
  1327. # from collections import OrderedDict
  1328. # keys = list(infos.keys())
  1329. # print(OrderedDict(sorted(infos.items(), key=lambda i: keys.index(i[0]))))
  1330. #
  1331. #
  1332. # def remove_labels(self, node_labels=[], edge_labels=[], node_attrs=[], edge_attrs=[]):
  1333. # node_labels = [item for item in node_labels if item in self._node_labels]
  1334. # edge_labels = [item for item in edge_labels if item in self._edge_labels]
  1335. # node_attrs = [item for item in node_attrs if item in self._node_attrs]
  1336. # edge_attrs = [item for item in edge_attrs if item in self._edge_attrs]
  1337. # for g in self._graphs:
  1338. # for nd in g.nodes():
  1339. # for nl in node_labels:
  1340. # del g.nodes[nd][nl]
  1341. # for na in node_attrs:
  1342. # del g.nodes[nd][na]
  1343. # for ed in g.edges():
  1344. # for el in edge_labels:
  1345. # del g.edges[ed][el]
  1346. # for ea in edge_attrs:
  1347. # del g.edges[ed][ea]
  1348. # if len(node_labels) > 0:
  1349. # self._node_labels = [nl for nl in self._node_labels if nl not in node_labels]
  1350. # if len(edge_labels) > 0:
  1351. # self._edge_labels = [el for el in self._edge_labels if el not in edge_labels]
  1352. # if len(node_attrs) > 0:
  1353. # self._node_attrs = [na for na in self._node_attrs if na not in node_attrs]
  1354. # if len(edge_attrs) > 0:
  1355. # self._edge_attrs = [ea for ea in self._edge_attrs if ea not in edge_attrs]
  1356. #
  1357. #
  1358. # def clean_labels(self):
  1359. # labels = []
  1360. # for name in self._node_labels:
  1361. # label = set()
  1362. # for G in self._graphs:
  1363. # label = label | set(nx.get_node_attributes(G, name).values())
  1364. # if len(label) > 1:
  1365. # labels.append(name)
  1366. # break
  1367. # if len(label) < 2:
  1368. # for G in self._graphs:
  1369. # for nd in G.nodes():
  1370. # del G.nodes[nd][name]
  1371. # self._node_labels = labels
  1372. # labels = []
  1373. # for name in self._edge_labels:
  1374. # label = set()
  1375. # for G in self._graphs:
  1376. # label = label | set(nx.get_edge_attributes(G, name).values())
  1377. # if len(label) > 1:
  1378. # labels.append(name)
  1379. # break
  1380. # if len(label) < 2:
  1381. # for G in self._graphs:
  1382. # for ed in G.edges():
  1383. # del G.edges[ed][name]
  1384. # self._edge_labels = labels
  1385. # labels = []
  1386. # for name in self._node_attrs:
  1387. # label = set()
  1388. # for G in self._graphs:
  1389. # label = label | set(nx.get_node_attributes(G, name).values())
  1390. # if len(label) > 1:
  1391. # labels.append(name)
  1392. # break
  1393. # if len(label) < 2:
  1394. # for G in self._graphs:
  1395. # for nd in G.nodes():
  1396. # del G.nodes[nd][name]
  1397. # self._node_attrs = labels
  1398. # labels = []
  1399. # for name in self._edge_attrs:
  1400. # label = set()
  1401. # for G in self._graphs:
  1402. # label = label | set(nx.get_edge_attributes(G, name).values())
  1403. # if len(label) > 1:
  1404. # labels.append(name)
  1405. # break
  1406. # if len(label) < 2:
  1407. # for G in self._graphs:
  1408. # for ed in G.edges():
  1409. # del G.edges[ed][name]
  1410. # self._edge_attrs = labels
  1411. #
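# A possible refactoring sketch (editorial assumption, not the original design):
# the four nearly identical passes in `clean_labels` above could share one helper
# that keeps only the label names taking at least two distinct values and strips
# the constant ones from every graph; the edge and attribute variants are analogous.
#
# def _drop_constant_node_labels(graphs, names):
#     kept = []
#     for name in names:
#         values = set()
#         for G in graphs:
#             values |= set(nx.get_node_attributes(G, name).values())
#             if len(values) > 1:
#                 kept.append(name)
#                 break
#         if len(values) < 2:
#             for G in graphs:
#                 for nd in G.nodes():
#                     G.nodes[nd].pop(name, None)  # pop() tolerates nodes missing the label
#     return kept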
  1412. #
  1413. # def cut_graphs(self, range_):
  1414. # self._graphs = [self._graphs[i] for i in range_]
  1415. # if self._targets is not None:
  1416. # self._targets = [self._targets[i] for i in range_]
  1417. # self.clean_labels()
  1418. # def trim_dataset(self, edge_required=False):
  1419. # if edge_required:
1420. # trimmed_pairs = [(idx, g) for idx, g in enumerate(self._graphs) if (nx.number_of_nodes(g) != 0 and nx.number_of_edges(g) != 0)]
1421. # else:
1422. # trimmed_pairs = [(idx, g) for idx, g in enumerate(self._graphs) if nx.number_of_nodes(g) != 0]
1423. # idx = [p[0] for p in trimmed_pairs]
1424. # self._graphs = [p[1] for p in trimmed_pairs]
1425. # self._targets = [self._targets[i] for i in idx] if self._targets is not None else None
  1426. # self.clean_labels()
  1427. #
  1428. #
  1429. # def copy(self):
  1430. # dataset = Dataset()
  1431. # graphs = [g.copy() for g in self._graphs] if self._graphs is not None else None
  1432. # target = self._targets.copy() if self._targets is not None else None
  1433. # node_labels = self._node_labels.copy() if self._node_labels is not None else None
  1434. # node_attrs = self._node_attrs.copy() if self._node_attrs is not None else None
  1435. # edge_labels = self._edge_labels.copy() if self._edge_labels is not None else None
  1436. # edge_attrs = self._edge_attrs.copy() if self._edge_attrs is not None else None
  1437. # dataset.load_graphs(graphs, target)
  1438. # dataset.set_labels(node_labels=node_labels, node_attrs=node_attrs, edge_labels=edge_labels, edge_attrs=edge_attrs)
  1439. # # @todo: clean_labels and add other class members?
  1440. # return dataset
  1441. #
  1442. #
  1443. # def get_all_node_labels(self):
  1444. # node_labels = []
  1445. # for g in self._graphs:
  1446. # for n in g.nodes():
  1447. # nl = tuple(g.nodes[n].items())
  1448. # if nl not in node_labels:
  1449. # node_labels.append(nl)
  1450. # return node_labels
  1451. #
  1452. #
  1453. # def get_all_edge_labels(self):
  1454. # edge_labels = []
  1455. # for g in self._graphs:
  1456. # for e in g.edges():
  1457. # el = tuple(g.edges[e].items())
  1458. # if el not in edge_labels:
  1459. # edge_labels.append(el)
  1460. # return edge_labels
  1461. #
  1462. #
  1463. # def _get_dataset_size(self):
  1464. # return len(self._graphs)
  1465. #
  1466. #
  1467. # def _get_all_node_nums(self):
  1468. # return [nx.number_of_nodes(G) for G in self._graphs]
  1469. #
  1470. #
1471. # def _get_total_node_num(self, all_node_nums):
  1472. # return np.sum(all_node_nums)
  1473. #
  1474. #
  1475. # def _get_ave_node_num(self, all_node_nums):
  1476. # return np.mean(all_node_nums)
  1477. #
  1478. #
  1479. # def _get_min_node_num(self, all_node_nums):
  1480. # return np.amin(all_node_nums)
  1481. #
  1482. #
  1483. # def _get_max_node_num(self, all_node_nums):
  1484. # return np.amax(all_node_nums)
  1485. #
  1486. #
  1487. # def _get_all_edge_nums(self):
  1488. # return [nx.number_of_edges(G) for G in self._graphs]
  1489. #
  1490. #
1491. # def _get_total_edge_num(self, all_edge_nums):
  1492. # return np.sum(all_edge_nums)
  1493. #
  1494. #
  1495. # def _get_ave_edge_num(self, all_edge_nums):
  1496. # return np.mean(all_edge_nums)
  1497. #
  1498. #
  1499. # def _get_min_edge_num(self, all_edge_nums):
  1500. # return np.amin(all_edge_nums)
  1501. #
  1502. #
  1503. # def _get_max_edge_num(self, all_edge_nums):
  1504. # return np.amax(all_edge_nums)
  1505. #
  1506. #
  1507. # def _get_node_label_dim(self):
  1508. # return len(self._node_labels)
  1509. #
  1510. #
  1511. # def _get_node_label_num(self, node_label):
  1512. # nl = set()
  1513. # for G in self._graphs:
  1514. # nl = nl | set(nx.get_node_attributes(G, node_label).values())
  1515. # return len(nl)
  1516. #
  1517. #
  1518. # def _get_edge_label_dim(self):
  1519. # return len(self._edge_labels)
  1520. #
  1521. #
  1522. # def _get_edge_label_num(self, edge_label):
  1523. # el = set()
  1524. # for G in self._graphs:
  1525. # el = el | set(nx.get_edge_attributes(G, edge_label).values())
  1526. # return len(el)
  1527. #
  1528. #
  1529. # def _is_directed(self):
  1530. # return nx.is_directed(self._graphs[0])
  1531. #
  1532. #
  1533. # def _get_all_node_degrees(self):
  1534. # return [np.mean(list(dict(G.degree()).values())) for G in self._graphs]
  1535. #
  1536. #
  1537. # def _get_ave_node_degree(self, all_node_degrees):
  1538. # return np.mean(all_node_degrees)
  1539. #
  1540. #
  1541. # def _get_max_node_degree(self, all_node_degrees):
  1542. # return np.amax(all_node_degrees)
  1543. #
  1544. #
  1545. # def _get_min_node_degree(self, all_node_degrees):
  1546. # return np.amin(all_node_degrees)
  1547. #
  1548. #
  1549. # def _get_all_fill_factors(self):
  1550. # """Get fill factor, the number of non-zero entries in the adjacency matrix.
  1551. # Returns
  1552. # -------
  1553. # list[float]
  1554. # List of fill factors for all graphs.
  1555. # """
  1556. # return [nx.number_of_edges(G) / (nx.number_of_nodes(G) ** 2) for G in self._graphs]
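# Worked example (editorial): a path graph on 4 nodes has 3 edges, so its fill
# factor is 3 / 4 ** 2 = 0.1875; for a simple undirected graph the value is
# bounded by (n * (n - 1) / 2) / n ** 2, i.e. it stays below 0.5.
#
# G = nx.path_graph(4)
# fill_factor = nx.number_of_edges(G) / nx.number_of_nodes(G) ** 2  # 0.1875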
  1557. #
  1558. # def _get_ave_fill_factor(self, all_fill_factors):
  1559. # return np.mean(all_fill_factors)
  1560. #
  1561. #
  1562. # def _get_max_fill_factor(self, all_fill_factors):
  1563. # return np.amax(all_fill_factors)
  1564. #
  1565. #
  1566. # def _get_min_fill_factor(self, all_fill_factors):
  1567. # return np.amin(all_fill_factors)
  1568. #
  1569. #
  1570. # def _get_substructures(self):
  1571. # subs = set()
  1572. # for G in self._graphs:
  1573. # degrees = list(dict(G.degree()).values())
  1574. # if any(i == 2 for i in degrees):
  1575. # subs.add('linear')
  1576. # if np.amax(degrees) >= 3:
  1577. # subs.add('non linear')
  1578. # if 'linear' in subs and 'non linear' in subs:
  1579. # break
  1580. # if self._directed:
  1581. # for G in self._graphs:
  1582. # if len(list(nx.find_cycle(G))) > 0:
  1583. # subs.add('cyclic')
  1584. # break
  1585. # # else:
  1586. # # # @todo: this method does not work for big graph with large amount of edges like D&D, try a better way.
  1587. # # upper = np.amin([nx.number_of_edges(G) for G in Gn]) * 2 + 10
  1588. # # for G in Gn:
  1589. # # if (nx.number_of_edges(G) < upper):
  1590. # # cyc = list(nx.simple_cycles(G.to_directed()))
  1591. # # if any(len(i) > 2 for i in cyc):
  1592. # # subs.add('cyclic')
  1593. # # break
  1594. # # if 'cyclic' not in subs:
  1595. # # for G in Gn:
  1596. # # cyc = list(nx.simple_cycles(G.to_directed()))
  1597. # # if any(len(i) > 2 for i in cyc):
  1598. # # subs.add('cyclic')
  1599. # # break
  1600. #
  1601. # return subs
  1602. #
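# Note: `nx.find_cycle` raises `networkx.exception.NetworkXNoCycle` instead of
# returning an empty list when the graph is acyclic, so the directed branch of
# `_get_substructures` above would fail on a DAG. A minimal exception-safe
# variant (editorial sketch, not the original code):
#
# def _has_cycle(G):
#     try:
#         nx.find_cycle(G)
#         return True
#     except nx.exception.NetworkXNoCycle:
#         return False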
  1603. #
1604. # def _get_class_number(self):
  1605. # return len(set(self._targets))
  1606. #
  1607. #
  1608. # def _get_node_attr_dim(self):
  1609. # return len(self._node_attrs)
  1610. #
  1611. #
  1612. # def _get_edge_attr_dim(self):
  1613. # return len(self._edge_attrs)
  1614. #
  1615. # def _compute_all_degree_entropy(self, base=None):
  1616. # """Compute the entropy of degree distribution of each graph.
  1617. # Parameters
  1618. # ----------
  1619. # base : float, optional
  1620. # The logarithmic base to use. The default is ``e`` (natural logarithm).
  1621. # Returns
  1622. # -------
1623. # degree_entropy : list[float]
1624. # The calculated entropy of the degree distribution of each graph.
  1625. # """
  1626. # from gklearn.utils.stats import entropy
  1627. #
  1628. # degree_entropy = []
  1629. # for g in self._graphs:
  1630. # degrees = list(dict(g.degree()).values())
  1631. # en = entropy(degrees, base=base)
  1632. # degree_entropy.append(en)
  1633. # return degree_entropy
  1634. #
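# For reference, a Shannon-entropy helper with the same call signature as the
# `entropy(degrees, base=base)` used above might look as follows (editorial
# sketch; the actual implementation in `gklearn.utils.stats` may differ):
#
# import math
# from collections import Counter
#
# def entropy(labels, base=None):
#     total = len(labels)
#     if total == 0:
#         return 0.0
#     probs = [count / total for count in Counter(labels).values()]
#     log = math.log if base is None else (lambda x: math.log(x, base))
#     return -sum(p * log(p) for p in probs)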
  1635. #
  1636. # @property
  1637. # def graphs(self):
  1638. # return self._graphs
  1639. # @property
  1640. # def targets(self):
  1641. # return self._targets
  1642. #
  1643. #
  1644. # @property
  1645. # def node_labels(self):
  1646. # return self._node_labels
  1647. # @property
  1648. # def edge_labels(self):
  1649. # return self._edge_labels
  1650. #
  1651. #
  1652. # @property
  1653. # def node_attrs(self):
  1654. # return self._node_attrs
  1655. #
  1656. #
  1657. # @property
  1658. # def edge_attrs(self):
  1659. # return self._edge_attrs
  1660. #
  1661. #
  1662. # def split_dataset_by_target(dataset):
  1663. # from gklearn.preimage.utils import get_same_item_indices
  1664. #
  1665. # graphs = dataset.graphs
  1666. # targets = dataset.targets
  1667. # datasets = []
  1668. # idx_targets = get_same_item_indices(targets)
  1669. # for key, val in idx_targets.items():
  1670. # sub_graphs = [graphs[i] for i in val]
  1671. # sub_dataset = Dataset()
  1672. # sub_dataset.load_graphs(sub_graphs, [key] * len(val))
  1673. # node_labels = dataset.node_labels.copy() if dataset.node_labels is not None else None
  1674. # node_attrs = dataset.node_attrs.copy() if dataset.node_attrs is not None else None
  1675. # edge_labels = dataset.edge_labels.copy() if dataset.edge_labels is not None else None
  1676. # edge_attrs = dataset.edge_attrs.copy() if dataset.edge_attrs is not None else None
  1677. # sub_dataset.set_labels(node_labels=node_labels, node_attrs=node_attrs, edge_labels=edge_labels, edge_attrs=edge_attrs)
  1678. # datasets.append(sub_dataset)
  1679. # # @todo: clean_labels?
  1680. # return datasets
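# Illustrative usage sketch (editorial, assuming a populated `dataset`):
#
# for sub in split_dataset_by_target(dataset):
#     # each sub-dataset holds the graphs of exactly one target class
#     print(sub.targets[0], len(sub.graphs))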
