- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- """
- Created on Tue Oct 20 14:25:49 2020
-
- @author:
- Paul Zanoncelli, paul.zanoncelli@ecole.ensicaen.fr
- Luc Brun luc.brun@ensicaen.fr
- Sebastien Bougleux sebastien.bougleux@unicaen.fr
- Benoit Gaüzère benoit.gauzere@insa-rouen.fr
- Linlin Jia linlin.jia@insa-rouen.fr
- """
- import os
- import os.path as osp
- import urllib.request
- import urllib.error
- import tarfile
- from zipfile import ZipFile
- # from gklearn.utils.graphfiles import loadDataset
- import torch.nn.functional as F
- import networkx as nx
- import torch
- import random
- import sys
- from lxml import etree
- import re
- from tqdm import tqdm
- from gklearn.dataset import DATABASES, DATASET_META
-
-
- class DataFetcher():
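- """Fetch graph datasets: download, cache and extract the archives listed in gklearn.dataset.DATASET_META.
-
- Archives are stored under ``root`` (one sub-directory per dataset) and extracted on first use; pass ``reload=True`` to force a fresh download. If ``name`` is None, all known datasets are fetched.
-
- Minimal usage sketch ('Acyclic' is only an illustrative dataset name, assumed to be present in DATASET_META):
-
- >>> fetcher = DataFetcher(name='Acyclic', root='datasets', verbose=True)
- >>> fetcher.path # path of the extracted dataset directory
- """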
-
- def __init__(self, name=None, root='datasets', reload=False, verbose=False):
- self._name = name
- self._root = root
- if not osp.exists(self._root):
- os.makedirs(self._root)
- self._reload = reload
- self._verbose = verbose
- # self.has_train_valid_test = {
- # "Coil_Del" : ('COIL-DEL/data/test.cxl','COIL-DEL/data/train.cxl','COIL-DEL/data/valid.cxl'),
- # "Coil_Rag" : ('COIL-RAG/data/test.cxl','COIL-RAG/data/train.cxl','COIL-RAG/data/valid.cxl'),
- # "Fingerprint" : ('Fingerprint/data/test.cxl','Fingerprint/data/train.cxl','Fingerprint/data/valid.cxl'),
- # # "Grec" : ('GREC/data/test.cxl','GREC/data/train.cxl','GREC/data/valid.cxl'),
- # "Letter" : {'HIGH' : ('Letter/HIGH/test.cxl','Letter/HIGH/train.cxl','Letter/HIGH/validation.cxl'),
- # 'MED' : ('Letter/MED/test.cxl','Letter/MED/train.cxl','Letter/MED/validation.cxl'),
- # 'LOW' : ('Letter/LOW/test.cxl','Letter/LOW/train.cxl','Letter/LOW/validation.cxl')
- # },
- # "Mutagenicity" : ('Mutagenicity/data/test.cxl','Mutagenicity/data/train.cxl','Mutagenicity/data/validation.cxl'),
- # # "Pah" : ['PAH/testset_0.ds','PAH/trainset_0.ds'],
- # "Protein" : ('Protein/data/test.cxl','Protein/data/train.cxl','Protein/data/valid.cxl'),
- # # "Web" : ('Web/data/test.cxl','Web/data/train.cxl','Web/data/valid.cxl')
- # }
-
- if self._name is None:
- if self._verbose:
- print('No dataset name entered. All possible datasets will be loaded.')
- self._name, self._path = [], []
- for idx, ds_name in enumerate(DATASET_META):
- if self._verbose:
- print(str(idx + 1), '/', str(len(DATASET_META)), 'Fetching', ds_name, end='... ')
- self._name.append(ds_name)
- success = self.write_archive_file(ds_name)
- if success:
- self._path.append(self.open_files(ds_name))
- else:
- self._path.append(None)
- if self._verbose and self._path[-1] is not None and not self._reload:
- print('Fetched.')
-
- if self._verbose:
- print('Finished.', str(sum(v is not None for v in self._path)), 'of', str(len(self._path)), 'datasets are successfully fetched.')
-
- elif self._name not in DATASET_META:
- message = 'Invalid dataset name "' + self._name + '".'
- message += '\nAvailable datasets are as follows: \n\n'
- message += '\n'.join(ds for ds in sorted(DATASET_META))
- message += '\n\nThe following special suffixes can be added to the name:'
- message += '\n\n' + '\n'.join(['_unlabeled'])
- raise ValueError(message)
- else:
- self.write_archive_file(self._name)
- self._path = self.open_files(self._name)
-
- # self.max_for_letter = 0
- # if mode == 'Pytorch':
- # if self._name in self.data_to_use_in_datasets :
- # Gs,y = self.dataset
- # inputs,adjs,y = self.from_networkx_to_pytorch(Gs,y)
- # #print(inputs,adjs)
- # self.pytorch_dataset = inputs,adjs,y
- # elif self._name == "Pah":
- # self.pytorch_dataset = []
- # test,train = self.dataset
- # Gs_test,y_test = test
- # Gs_train,y_train = train
- # self.pytorch_dataset.append(self.from_networkx_to_pytorch(Gs_test,y_test))
- # self.pytorch_dataset.append(self.from_networkx_to_pytorch(Gs_train,y_train))
- # elif self._name in self.has_train_valid_test:
- # self.pytorch_dataset = []
- # #[G[e[0]][e[1]]['bond_type'] for e in G.edges()] for G in Gs])
- # test,train,valid = self.dataset
- # Gs_test,y_test = test
- #
- # Gs_train,y_train = train
- # Gs_valid,y_valid = valid
- # self.pytorch_dataset.append(self.from_networkx_to_pytorch(Gs_test,y_test))
- # self.pytorch_dataset.append(self.from_networkx_to_pytorch(Gs_train,y_train))
- # self.pytorch_dataset.append(self.from_networkx_to_pytorch(Gs_valid,y_valid))
- # #############
- # """
- # for G in Gs :
- # for e in G.edges():
- # print(G[e[0]])
- # """
- # ##############
-
-
- def download_file(self, url):
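- """Open ``url`` and return the HTTP response object, or ``None`` if the URL is invalid or the network is unreachable."""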
- try:
- response = urllib.request.urlopen(url)
- except urllib.error.HTTPError:
- print('"' + url.split('/')[-1] + '" is not available or the HTTP link is incorrect.')
- return
- except urllib.error.URLError:
- print('Network is unreachable.')
- return
- return response
-
-
- def write_archive_file(self, ds_name):
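- """Download the archive of dataset ``ds_name`` into ``self._root`` if it is not already present (or if ``reload`` is set). Return ``True`` on success, ``False`` if the download failed."""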
- path = osp.join(self._root, ds_name)
- # filename_dir = osp.join(path,filename)
- if not osp.exists(path) or self._reload:
- url = DATASET_META[ds_name]['url']
- response = self.download_file(url)
- if response is None:
- return False
- os.makedirs(path, exist_ok=True)
- with open(os.path.join(path, url.split('/')[-1]), 'wb') as outfile:
- outfile.write(response.read())
-
- return True
-
-
- def open_files(self, ds_name=None):
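- """Extract the downloaded archive of ``ds_name`` (``.tar.gz``, ``.tar`` or ``.zip``) under ``self._root`` and return the path of the extracted directory."""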
- if ds_name is None:
- ds_name = (self._name if isinstance(self._name, str) else self._name[0])
- filename = DATASET_META[ds_name]['url'].split('/')[-1]
- path = osp.join(self._root, ds_name)
- filename_archive = osp.join(path, filename)
-
- if filename.endswith('gz'):
- if tarfile.is_tarfile(filename_archive):
- with tarfile.open(filename_archive, 'r:gz') as tar:
- if self._reload and self._verbose:
- print(filename + ' Downloaded.')
- subpath = os.path.join(path, tar.getnames()[0].split('/')[0])
- if not osp.exists(subpath) or self._reload:
- tar.extractall(path = path)
- return subpath
- elif filename.endswith('.tar'):
- if tarfile.is_tarfile(filename_archive):
- with tarfile.open(filename_archive, 'r:') as tar:
- if self._reload and self._verbose:
- print(filename + ' Downloaded.')
- subpath = os.path.join(path, tar.getnames()[0])
- if not osp.exists(subpath) or self._reload:
- tar.extractall(path = path)
- return subpath
- elif filename.endswith('.zip'):
- with ZipFile(filename_archive, 'r') as zip_ref:
- if self._reload and self._verbose:
- print(filename + ' Downloaded.')
- subpath = os.path.join(path, zip_ref.namelist()[0])
- if not osp.exists(subpath) or self._reload:
- zip_ref.extractall(path)
- return subpath
- else:
- raise ValueError('Unsupported archive format: ' + filename)
-
-
- def get_all_ds_infos(self, database):
- """Get information of all datasets from a database.
-
- Parameters
- ----------
- database : string
- DESCRIPTION.
-
- Returns
- -------
- None.
- """
- if database.lower() == 'tudataset':
- infos = self.get_all_tud_ds_infos()
- elif database.lower() == 'iam':
- infos = {} # Fetching IAM dataset information is not implemented yet.
- else:
- msg = 'Invalid database name "' + database + '".'
- msg += '\nAvailable databases are as follows: \n\n'
- msg += '\n'.join(db for db in sorted(DATABASES))
- msg += '\n\nCheck "gklearn.dataset.DATASET_META" for more details.'
- raise ValueError(msg)
-
- return infos
-
-
- def get_all_tud_ds_infos(self):
- """Get information of all datasets from database TUDataset.
-
- Returns
- -------
- None.
- """
- try:
- response = urllib.request.urlopen(DATABASES['tudataset'])
- except urllib.error.HTTPError:
- print('The URL of the database "TUDataset" is not available:\n' + DATABASES['tudataset'])
- return {}
-
- infos = {}
-
- # Get tables.
- h_str = response.read()
- tree = etree.HTML(h_str)
- tables = tree.xpath('//table')
- for table in tables:
- # Get the domain of the datasets.
- h2_nodes = table.getprevious()
- if h2_nodes is not None and h2_nodes.tag == 'h2':
- domain = h2_nodes.text.strip().lower()
- else:
- domain = ''
-
- # Get each line in the table.
- tr_nodes = table.xpath('tbody/tr')
- for tr in tr_nodes[1:]:
- # Get each element in the line.
- td_node = tr.xpath('td')
-
- # task type.
- cls_txt = td_node[3].text.strip()
- if not cls_txt.startswith('R'):
- class_number = int(cls_txt)
- task_type = 'classification'
- else:
- class_number = None
- task_type = 'regression'
-
- # node attrs.
- na_text = td_node[8].text.strip()
- if not na_text.startswith('+'):
- node_attr_dim = 0
- else:
- node_attr_dim = int(re.findall(r'\((.*)\)', na_text)[0])
-
- # edge attrs.
- ea_text = td_node[10].text.strip()
- if ea_text == 'temporal':
- edge_attr_dim = ea_text
- elif not ea_text.startswith('+'):
- edge_attr_dim = 0
- else:
- edge_attr_dim = int(re.findall(r'\((.*)\)', ea_text)[0])
-
- # geometry.
- geo_txt = td_node[9].text.strip()
- if geo_txt == '–':
- geometry = None
- else:
- geometry = geo_txt
-
- # url.
- url = td_node[11].xpath('a')[0].attrib['href'].strip()
- pos_zip = url.rfind('.zip')
- url = url[:pos_zip + 4]
-
- infos[td_node[0].xpath('strong')[0].text.strip()] = {
- 'database': 'tudataset',
- 'reference': td_node[1].text.strip(),
- 'dataset_size': int(td_node[2].text.strip()),
- 'class_number': class_number,
- 'task_type': task_type,
- 'ave_node_num': float(td_node[4].text.strip()),
- 'ave_edge_num': float(td_node[5].text.strip()),
- 'node_labeled': td_node[6].text.strip() == '+',
- 'edge_labeled': td_node[7].text.strip() == '+',
- 'node_attr_dim': node_attr_dim,
- 'geometry': geometry,
- 'edge_attr_dim': edge_attr_dim,
- 'url': url,
- 'domain': domain
- }
-
- return infos
-
-
- def pretty_ds_infos(self, infos):
- """Get the string that pretty prints the information of datasets.
-
- Parameters
- ----------
- datasets : dict
- The datasets' information.
-
- Returns
- -------
- p_str : string
- The pretty print of the datasets' information.
- """
- p_str = '{\n'
- for key, val in infos.items():
- p_str += '\t\'' + str(key) + '\': {\n'
- for k, v in val.items():
- p_str += '\t\t\'' + str(k) + '\': '
- if isinstance(v, str):
- p_str += '\'' + str(v) + '\',\n'
- else:
- p_str += str(v) + ',\n'
- p_str += '\t},\n'
- p_str += '}'
-
- return p_str
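-
- # Minimal usage sketch (assumes a constructed DataFetcher instance named `fetcher`
- # and network access to the TUDataset website):
- # infos = fetcher.get_all_ds_infos('tudataset')
- # print(fetcher.pretty_ds_infos(infos))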
-
-
- @property
- def path(self):
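- """The path of the extracted dataset directory (a list of paths, possibly containing None entries, when all datasets were fetched)."""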
- return self._path
-
-
-
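- # NOTE: the methods below (dataset, info, iter_load_dataset, load_dataset,
- # build_dictionary, from_networkx_to_pytorch, from_pytorch_to_tensorflow,
- # from_networkx_to_tensor) are kept from the older DataLoader interface. They
- # rely on attributes such as self.mode, self.option, self.has_train_valid_test
- # and self.data_to_use_in_datasets, and on gklearn.utils.graphfiles.loadDataset
- # (import commented out above), none of which are set up by the current
- # constructor, so they are not usable as-is.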
-
- def dataset(self):
- if self.mode == "Tensorflow":
- return #something
- if self.mode == "Pytorch":
- return self.pytorch_dataset
- return self.dataset
-
-
- def info(self):
- print(self.info_dataset[self._name])
-
-
- def iter_load_dataset(self,data):
- results = []
- for datasets in data :
- results.append(loadDataset(osp.join(self._root,self._name,datasets)))
- return results
-
-
- def load_dataset(self, list_files):
- if self._name == "Ptc":
- if not isinstance(self.option, str) or self.option.upper() not in ['FR', 'FM', 'MM', 'MR']:
- raise ValueError('The option for the Ptc dataset needs to be one of: fr, fm, mm, mr.')
- results = []
- results.append(loadDataset(osp.join(self._root, self._name, 'PTC/Test', self.option.upper() + '.ds')))
- results.append(loadDataset(osp.join(self._root, self._name, 'PTC/Train', self.option.upper() + '.ds')))
- return results
- if self._name == "Pah":
- maximum_sets = 0
- for file in list_files:
- if file.endswith('ds'):
- maximum_sets = max(maximum_sets, int(file.split('_')[1].split('.')[0]))
- self.max_for_letter = maximum_sets
- if not isinstance(self.option, int) or self.option > maximum_sets or self.option < 0:
- raise ValueError('option needs to be an integer between 0 and ' + str(maximum_sets))
- data = self.has_train_valid_test["Pah"]
- data[0] = self.has_train_valid_test["Pah"][0].split('_')[0] + '_' + str(self.option) + '.ds'
- data[1] = self.has_train_valid_test["Pah"][1].split('_')[0] + '_' + str(self.option) + '.ds'
- return self.iter_load_dataset(data)
- if self._name == "Letter":
- if isinstance(self.option, str) and self.option.upper() in self.has_train_valid_test["Letter"]:
- data = self.has_train_valid_test["Letter"][self.option.upper()]
- else:
- message = "The option parameter for the Letter dataset is incorrect; "
- message += "choose between: high, med, low."
- raise ValueError(message)
- return self.iter_load_dataset(data)
- if self._name in self.has_train_valid_test: # common IAM dataset with train, valid and test sets
- data = self.has_train_valid_test[self._name]
- return self.iter_load_dataset(data)
- else: # common dataset without train/valid/test splits, only a dataset.ds file
- data = self.data_to_use_in_datasets[self._name]
- if len(data) > 1 and data[0] in list_files and data[1] in list_files: # case for Alkane
- return loadDataset(osp.join(self._root, self._name, data[0]), filename_y=osp.join(self._root, self._name, data[1]))
- if data in list_files:
- return loadDataset(osp.join(self._root, self._name, data))
-
-
- def build_dictionary(self,Gs):
- labels = set()
- #next line : from DeepGraphWithNNTorch
- #bond_type_number_maxi = int(max(max([[G[e[0]][e[1]]['bond_type'] for e in G.edges()] for G in Gs])))
- sizes = set()
- for G in Gs :
- for _,node in G.nodes(data = True): # or for node in nx.nodes(G)
- #print(_,node)
- labels.add(node["label"][0]) # labels.add(G.nodes[node]["label"][0]) #what do we use for IAM datasets (they don't have bond_type or event label) ?
- sizes.add(G.order())
- label_dict = {}
- #print("labels : ", labels, bond_type_number_maxi)
- for i,label in enumerate(labels):
- label_dict[label] = [0.]*len(labels)
- label_dict[label][i] = 1.
- return label_dict
-
-
- def from_networkx_to_pytorch(self,Gs,y):
- #exemple for MAO: atom_to_onehot = {'C': [1., 0., 0.], 'N': [0., 1., 0.], 'O': [0., 0., 1.]}
- # code from https://github.com/bgauzere/pygnn/blob/master/utils.py
- atom_to_onehot = self.build_dictionary(Gs)
- max_size = 30
- adjs = []
- inputs = []
- for i, G in enumerate(Gs):
- I = torch.eye(G.order(), G.order())
- #A = torch.Tensor(nx.adjacency_matrix(G).todense())
- #A = torch.Tensor(nx.to_numpy_matrix(G))
- A = torch.tensor(nx.to_scipy_sparse_matrix(G,dtype = int,weight = 'bond_type').todense(),dtype = torch.int) #what do we use for IAM datasets (they don't have bond_type or event label) ?
- adj = F.pad(A, pad=(0, max_size-G.order(), 0, max_size-G.order())) #add I now ? if yes : F.pad(A + I,pad = (...))
- adjs.append(adj)
-
- f_0 = []
- for _, label in G.nodes(data=True):
- #print(_,label)
- cur_label = atom_to_onehot[label['label'][0]].copy()
- f_0.append(cur_label)
-
- X = F.pad(torch.Tensor(f_0), pad=(0, 0, 0, max_size-G.order()))
- inputs.append(X)
- return inputs,adjs,y
-
-
- def from_pytorch_to_tensorflow(self, batch_size):
- seed = random.randrange(sys.maxsize)
- random.seed(seed)
- tf_inputs = random.sample(self.pytorch_dataset[0], batch_size)
- random.seed(seed)
- tf_y = random.sample(self.pytorch_dataset[2], batch_size)
- return tf_inputs, tf_y
-
-
- def from_networkx_to_tensor(self, G, label_dict):
- A = nx.to_numpy_matrix(G)
- lab = [label_dict[G.nodes[v]['label'][0]] for v in nx.nodes(G)]
- return (torch.tensor(A).view(1, A.shape[0] * A.shape[1]), torch.tensor(lab))
-
-
-
-
- #dataset= selfopen_files()
- #print(build_dictionary(Gs))
- #dic={'C':0,'N':1,'O':2}
- #A,labels=from_networkx_to_tensor(Gs[13],dic)
- #print(nx.to_numpy_matrix(Gs[13]),labels)
- #print(A,labels)
-
- #@todo : from_networkx_to_tensorflow
-
-
- # dataloader = DataLoader('Acyclic',root = "database",option = 'high',mode = "Pytorch")
- # dataloader.info()
- # inputs,adjs,y = dataloader.pytorch_dataset
-
- # """
- # test,train,valid = dataloader.dataset
- # Gs,y = test
- # Gs2,y2 = train
- # Gs3,y3 = valid
- # """
- # #Gs,y = dataloader.
- # #print(Gs,y)
- # """
- # Gs,y = dataloader.dataset
- # for G in Gs :
- # for e in G.edges():
- # print(G[e[0]])
-
- # """
-
- # #for e in Gs[13].edges():
- # # print(Gs[13][e[0]])
-
- # #print(from_networkx_to_tensor(Gs[7],{'C': [1., 0., 0.], 'N': [0., 1., 0.], 'O': [0., 0., 1.]}))
-
- # #dataset.open_files()
-
-
- # import os
- # import os.path as osp
- # import urllib
- # import tarfile
- # from zipfile import ZipFile
- # from gklearn.utils.graphfiles import loadDataset
- # import torch
- # import torch.nn.functional as F
- # import networkx as nx
- # import matplotlib.pyplot as plt
- # import numpy as np
-
-
- #
- # def DataLoader(name,root = 'data',mode = "Networkx",downloadAll = False,reload = False,letter = "High",number = 0,gender = "MM"):
- # dir_name = "_".join(name.split("-"))
- # if not osp.exists(root) :
- # os.makedirs(root)
- # url = "https://brunl01.users.greyc.fr/CHEMISTRY/"
- # urliam = "https://iapr-tc15.greyc.fr/IAM/"
- # list_database = {
- # "Ace" : (url,"ACEDataset.tar"),
- # "Acyclic" : (url,"Acyclic.tar.gz"),
- # "Aids" : (urliam,"AIDS.zip"),
- # "Alkane" : (url,"alkane_dataset.tar.gz"),
- # "Chiral" : (url,"DatasetAcyclicChiral.tar"),
- # "Coil_Del" : (urliam,"COIL-DEL.zip"),
- # "Coil_Rag" : (urliam,"COIL-RAG.zip"),
- # "Fingerprint" : (urliam,"Fingerprint.zip"),
- # "Grec" : (urliam,"GREC.zip"),
- # "Letter" : (urliam,"Letter.zip"),
- # "Mao" : (url,"mao.tgz"),
- # "Monoterpenoides" : (url,"monoterpenoides.tar.gz"),
- # "Mutagenicity" : (urliam,"Mutagenicity.zip"),
- # "Pah" : (url,"PAH.tar.gz"),
- # "Protein" : (urliam,"Protein.zip"),
- # "Ptc" : (url,"ptc.tgz"),
- # "Steroid" : (url,"SteroidDataset.tar"),
- # "Vitamin" : (url,"DatasetVitamin.tar"),
- # "Web" : (urliam,"Web.zip")
- # }
- #
- # data_to_use_in_datasets = {
- # "Acyclic" : ("Acyclic/dataset_bps.ds"),
- # "Aids" : ("AIDS_A.txt"),
- # "Alkane" : ("Alkane/dataset.ds","Alkane/dataset_boiling_point_names.txt"),
- # "Mao" : ("MAO/dataset.ds"),
- # "Monoterpenoides" : ("monoterpenoides/dataset_10+.ds"), #('monoterpenoides/dataset.ds'),('monoterpenoides/dataset_9.ds'),('monoterpenoides/trainset_9.ds')
- #
- # }
- # has_train_valid_test = {
- # "Coil_Del" : ('COIL-DEL/data/test.cxl','COIL-DEL/data/train.cxl','COIL-DEL/data/valid.cxl'),
- # "Coil_Rag" : ('COIL-RAG/data/test.cxl','COIL-RAG/data/train.cxl','COIL-RAG/data/valid.cxl'),
- # "Fingerprint" : ('Fingerprint/data/test.cxl','Fingerprint/data/train.cxl','Fingerprint/data/valid.cxl'),
- # "Grec" : ('GREC/data/test.cxl','GREC/data/train.cxl','GREC/data/valid.cxl'),
- # "Letter" : {'HIGH' : ('Letter/HIGH/test.cxl','Letter/HIGH/train.cxl','Letter/HIGH/validation.cxl'),
- # 'MED' : ('Letter/MED/test.cxl','Letter/MED/train.cxl','Letter/MED/validation.cxl'),
- # 'LOW' : ('Letter/LOW/test.cxl','Letter/LOW/train.cxl','Letter/LOW/validation.cxl')
- # },
- # "Mutagenicity" : ('Mutagenicity/data/test.cxl','Mutagenicity/data/train.cxl','Mutagenicity/data/validation.cxl'),
- # "Pah" : ['PAH/testset_0.ds','PAH/trainset_0.ds'],
- # "Protein" : ('Protein/data/test.cxl','Protein/data/train.cxl','Protein/data/valid.cxl'),
- # "Web" : ('Web/data/test.cxl','Web/data/train.cxl','Web/data/valid.cxl')
- # }
- #
- # if not name :
- # raise ValueError("No dataset entered")
- # if name not in list_database:
- # message = "Invalid Dataset name " + name
- # message += '\n Available datasets are as follows : \n\n'
-
- # message += '\n'.join(database for database in list_database)
- # raise ValueError(message)
- #
- # def download_file(url,filename):
- # try :
- # response = urllib.request.urlopen(url + filename)
- # except urllib.error.HTTPError:
- # print(filename + " not available or incorrect http link")
- # return
- # return response
- #
- # def write_archive_file(root,database):
- # path = osp.join(root,database)
- # url,filename = list_database[database]
- # filename_dir = osp.join(path,filename)
- # if not osp.exists(filename_dir) or reload:
- # response = download_file(url,filename)
- # if response is None :
- # return
- # if not osp.exists(path) :
- # os.makedirs(path)
- # with open(filename_dir,'wb') as outfile :
- # outfile.write(response.read())
- #
- # if downloadAll :
- # print('Waiting...')
- # for database in list_database :
- # write_archive_file(root,database)
- # print('Downloading finished')
- # else:
- # write_archive_file(root,name)
- #
- # def iter_load_dataset(data):
- # results = []
- # for datasets in data :
- # results.append(loadDataset(osp.join(root,name,datasets)))
- # return results
- #
- # def load_dataset(list_files):
- # if name == "Ptc":
- # if gender.upper() not in ['FR','FM','MM','MR']:
- # raise ValueError('gender chosen needs to be one of \n fr fm mm mr')
- # results = []
- # results.append(loadDataset(osp.join(root,name,'PTC/Test',gender.upper() + '.ds')))
- # results.append(loadDataset(osp.join(root,name,'PTC/Train',gender.upper() + '.ds')))
- # return results
- # if name == "Pah":
- # maximum_sets = 0
- # for file in list_files:
- # if file.endswith('ds'):
- # maximum_sets = max(maximum_sets,int(file.split('_')[1].split('.')[0]))
- # if number > maximum_sets :
- # raise ValueError("Please select a dataset with number less than " + str(maximum_sets + 1))
- # data = has_train_valid_test["Pah"]
- # data[0] = has_train_valid_test["Pah"][0].split('_')[0] + '_' + str(number) + '.ds'
- # data[1] = has_train_valid_test["Pah"][1].split('_')[0] + '_' + str(number) + '.ds'
- # #print(data)
- # return iter_load_dataset(data)
- # if name == "Letter":
- # if letter.upper() in has_train_valid_test["Letter"]:
- # data = has_train_valid_test["Letter"][letter.upper()]
- # else:
- # message = "The parameter for letter is incorrect choose between : "
- # message += "\nhigh med low"
- # raise ValueError(message)
- # results = []
- # for datasets in data:
- # results.append(loadDataset(osp.join(root,name,datasets)))
- # return results
- # if name in has_train_valid_test : #common IAM dataset with train, valid and test
- # data = has_train_valid_test[name]
- # results = []
- # for datasets in data :
- # results.append(loadDataset(osp.join(root,name,datasets)))
- # return results
- # else: #common dataset without train,valid and test, only dataset.ds file
- # data = data_to_use_in_datasets[name]
- # if len(data) > 1 and data[0] in list_files and data[1] in list_files:
- # return loadDataset(osp.join(root,name,data[0]),filename_y = osp.join(root,name,data[1]))
- # if data in list_files:
- # return loadDataset(osp.join(root,name,data))
-
- # def open_files():
- # filename = list_database[name][1]
- # path = osp.join(root,name)
- # filename_archive = osp.join(root,name,filename)
- #
- # if filename.endswith('gz'):
- # if tarfile.is_tarfile(filename_archive):
- # with tarfile.open(filename_archive,"r:gz") as tar:
- # if reload:
- # print(filename + " Downloaded")
- # tar.extractall(path = path)
- # return load_dataset(tar.getnames())
- # #raise ValueError("dataset not available")
- #
- #
- # elif filename.endswith('.tar'):
- # if tarfile.is_tarfile(filename_archive):
- # with tarfile.open(filename_archive,"r:") as tar:
- # if reload :
- # print(filename + " Downloaded")
- # tar.extractall(path = path)
- # return load_dataset(tar.getnames())
- # elif filename.endswith('.zip'):
- # with ZipFile(filename_archive,"r") as zip_ref:
- # if reload :
- # print(filename + " Downloaded")
- # zip_ref.extractall(path)
- # return load_dataset(zip_ref.namelist())
- # else:
- # print(filename + " Unsupported file")
- # """
- # with tarfile.open(osp.join(root,name,list_database[name][1]),"r:gz") as files:
- # for file in files.getnames():
- # print(file)
- # """
- #
- # def build_dictionary(Gs):
- # labels = set()
- # bond_type_number_maxi = int(max(max([[G[e[0]][e[1]]['bond_type'] for e in G.edges()] for G in Gs])))
- # print(bond_type_number_maxi)
- # sizes = set()
- # for G in Gs :
- # for _,node in G.nodes(data = True): # or for node in nx.nodes(G)
- # #print(node)
- # labels.add(node["label"][0]) # labels.add(G.nodes[node]["label"][0])
- # sizes.add(G.order())
- # if len(labels) >= bond_type_number_maxi:
- # break
- # label_dict = {}
- # for i,label in enumerate(labels):
- # label_dict[label] = [0.]*bond_type_number_maxi
- # label_dict[label][i] = 1.
- # return label_dict
- #
- # def from_networkx_to_pytorch(Gs):
- # #exemple : atom_to_onehot = {'C': [1., 0., 0.], 'N': [0., 1., 0.], 'O': [0., 0., 1.]}
- # # code from https://github.com/bgauzere/pygnn/blob/master/utils.py
- # atom_to_onehot = build_dictionary(Gs)
- # max_size = 30
- # adjs = []
- # inputs = []
- # for i, G in enumerate(Gs):
- # I = torch.eye(G.order(), G.order())
- # A = torch.Tensor(nx.adjacency_matrix(G).todense())
- # A = torch.tensor(nx.to_scipy_sparse_matrix(G,dtype = int,weight = 'bond_type').todense(),dtype = torch.int)
- # adj = F.pad(A+I, pad=(0, max_size-G.order(), 0, max_size-G.order())) #add I now ?
- # adjs.append(adj)
-
- # f_0 = []
- # for _, label in G.nodes(data=True):
- # #print(_,label)
- # cur_label = atom_to_onehot[label['label'][0]].copy()
- # f_0.append(cur_label)
-
- # X = F.pad(torch.Tensor(f_0), pad=(0, 0, 0, max_size-G.order()))
- # inputs.append(X)
- # return inputs,adjs,y
- #
- # def from_networkx_to_tensor(G,dict):
-
- # A=nx.to_numpy_matrix(G)
- # lab=[dict[G.nodes[v]['label'][0]] for v in nx.nodes(G)]
- # return (torch.tensor(A).view(1,A.shape[0]*A.shape[1]),torch.tensor(lab))
- #
- # dataset= open_files()
- # #print(build_dictionary(Gs))
- # #dic={'C':0,'N':1,'O':2}
- # #A,labels=from_networkx_to_tensor(Gs[13],dic)
- # #print(nx.to_numpy_matrix(Gs[13]),labels)
- # #print(A,labels)
- #
- # """
- # for G in Gs :
- # for node in nx.nodes(G):
- # print(G.nodes[node])
- # """
- # if mode == "pytorch":
- # Gs,y = dataset
- # inputs,adjs,y = from_networkx_to_pytorch(Gs)
- # print(inputs,adjs)
- # return inputs,adjs,y
- #
- #
- # """
- # dic = dict()
- # for i,l in enumerate(label):
- # dic[l] = i
- # dic = {'C': 0, 'N': 1, 'O': 2}
- # A,labels=from_networkx_to_tensor(Gs[0],dic)
- # #print(A,labels)
- # return A,labels
- # """
- #
- # return dataset
- #
- # #open_files()
- #
-
- # def label_to_color(label):
- # if label == 'C':
- # return 0.1
- # elif label == 'O':
- # return 0.8
- #
- # def nodes_to_color_sequence(G):
- # return [label_to_color(c[1]['label'][0]) for c in G.nodes(data=True)]
-
-
- # ##############
- # """
- # dataset = DataLoader('Mao',root = "database")
- # print(dataset)
- # Gs,y = dataset
- # """
-
- # """
- # dataset = DataLoader('Alkane',root = "database") # Gs is empty here whereas y isn't -> not working
- # Gs,y = dataset
- # """
-
- # """
- # dataset = DataLoader('Acyclic', root = "database")
- # Gs,y = dataset
- # """
-
- # """
- # dataset = DataLoader('Monoterpenoides', root = "database")
- # Gs,y = dataset
- # """
-
- # """
- # dataset = DataLoader('Pah',root = 'database', number = 8)
- # test_set,train_set = dataset
- # Gs,y = test_set
- # Gs2,y2 = train_set
- # """
-
- # """
- # dataset = DataLoader('Coil_Del',root = "database")
- # test,train,valid = dataset
- # Gs,y = test
- # Gs2,y2 = train
- # Gs3, y3 = valid
- # """
-
- # """
- # dataset = DataLoader('Coil_Rag',root = "database")
- # test,train,valid = dataset
- # Gs,y = test
- # Gs2,y2 = train
- # Gs3, y3 = valid
- # """
-
- # """
- # dataset = DataLoader('Fingerprint',root = "database")
- # test,train,valid = dataset
- # Gs,y = test
- # Gs2,y2 = train
- # Gs3, y3 = valid
- # """
-
- # """
- # dataset = DataLoader('Grec',root = "database")
- # test,train,valid = dataset
- # Gs,y = test
- # Gs2,y2 = train
- # Gs3, y3 = valid
- # """
-
- # """
- # dataset = DataLoader('Letter',root = "database",letter = 'low') #high low med
- # test,train,valid = dataset
- # Gs,y = test
- # Gs2,y2 = train
- # Gs3, y3 = valid
- # """
-
- # """
- # dataset = DataLoader('Mutagenicity',root = "database")
- # test,train,valid = dataset
- # Gs,y = test
- # Gs2,y2 = train
- # Gs3, y3 = valid
- # """
- # """
- # dataset = DataLoader('Protein',root = "database")
- # test,train,valid = dataset
- # Gs,y = test
- # Gs2,y2 = train
- # Gs3, y3 = valid
- # """
-
-
- # """
- # dataset = DataLoader('Ptc', root = "database",gender = 'fm') # not working, Gs and y are empty perhaps issue coming from loadDataset
- # valid,train = dataset
- # Gs,y = valid
- # Gs2,y2 = train
- # """
-
- # """
- # dataset = DataLoader('Web', root = "database")
- # test,train,valid = dataset
- # Gs,y = test
- # Gs2,y2 = train
- # Gs3,y3 = valid
- # """
- # print(Gs,y)
- # print(len(dataset))
- # ##############
- # #print('edge max label',max(max([[G[e[0]][e[1]]['bond_type'] for e in G.edges()] for G in Gs])))
- # G1 = Gs[13]
- # G2 = Gs[23]
- # """
- # nx.draw_networkx(G1,with_labels=True,node_color = nodes_to_color_sequence(G1),cmap='autumn')
- # plt.figure()
-
- # nx.draw_networkx(G2,with_labels=True,node_color = nodes_to_color_sequence(G2),cmap='autumn')
- # """
-
-
- # from pathlib import Path
-
- # DATA_PATH = Path("data")
-
- # def import_datasets():
- #
- # import urllib
- # import tarfile
- # from zipfile import ZipFile
-
- # URL = "https://brunl01.users.greyc.fr/CHEMISTRY/"
- # URLIAM = "https://iapr-tc15.greyc.fr/IAM/"
- #
-
- # LIST_DATABASE = {
- # "Pah" : (URL,"PAH.tar.gz"),
- # "Mao" : (URL,"mao.tgz"),
- # "Ptc" : (URL,"ptc.tgz"),
- # "Aids" : (URLIAM,"AIDS.zip"),
- # "Acyclic" : (URL,"Acyclic.tar.gz"),
- # "Alkane" : (URL,"alkane_dataset.tar.gz"),
- # "Chiral" : (URL,"DatasetAcyclicChiral.tar"),
- # "Vitamin" : (URL,"DatasetVitamin.tar"),
- # "Ace" : (URL,"ACEDataset.tar"),
- # "Steroid" : (URL,"SteroidDataset.tar"),
- # "Monoterpenoides" : (URL,"monoterpenoides.tar.gz"),
- # "Letter" : (URLIAM,"Letter.zip"),
- # "Grec" : (URLIAM,"GREC.zip"),
- # "Fingerprint" : (URLIAM,"Fingerprint.zip"),
- # "Coil_Rag" : (URLIAM,"COIL-RAG.zip"),
- # "Coil_Del" : (URLIAM,"COIL-DEL.zip"),
- # "Web" : (URLIAM,"Web.zip"),
- # "Mutagenicity" : (URLIAM,"Mutagenicity.zip"),
- # "Protein" : (URLIAM,"Protein.zip")
- # }
- # print("Select databases in the list. Select multiple, split by white spaces .\nWrite All to select all of them.\n")
- # print(', '.join(database for database in LIST_DATABASE))
-
- # print("Choice : ",end = ' ')
- # selected_databases = input().split()
-
- #
- # def download_file(url,filename):
- # try :
- # response = urllib.request.urlopen(url + filename)
- # except urllib.error.HTTPError:
- # print(filename + " not available or incorrect http link")
- # return
- # return response
- #
- # def write_archive_file(database):
- #
- # PATH = DATA_PATH / database
- # url,filename = LIST_DATABASE[database]
- # if not (PATH / filename).exists():
- # response = download_file(url,filename)
- # if response is None :
- # return
- # if not PATH.exists() :
- # PATH.mkdir(parents=True, exist_ok=True)
- # with open(PATH/filename,'wb') as outfile :
- # outfile.write(response.read())
- #
- # if filename[-2:] == 'gz':
- # if tarfile.is_tarfile(PATH/filename):
- # with tarfile.open(PATH/filename,"r:gz") as tar:
- # tar.extractall(path = PATH)
- # print(filename + ' Downloaded')
- # elif filename[-3:] == 'tar':
- # if tarfile.is_tarfile(PATH/filename):
- # with tarfile.open(PATH/filename,"r:") as tar:
- # tar.extractall(path = PATH)
- # print(filename + ' Downloaded')
- # elif filename[-3:] == 'zip':
- # with ZipFile(PATH/filename,"r") as zip_ref:
- # zip_ref.extractall(PATH)
- # print(filename + ' Downloaded')
- # else:
- # print("Unsupported file")
-
- # if 'All' in selected_databases:
- # print('Waiting...')
- # for database in LIST_DATABASE :
- # write_archive_file(database)
- # print('Finished')
- # else:
- # print('Waiting...')
- # for database in selected_databases :
- # if database in LIST_DATABASE :
- # write_archive_file(database)
- # print('Finished')
- # import_datasets()
-
-
- # class GraphFetcher(object):
- #
- #
- # def __init__(self, filename=None, filename_targets=None, **kwargs):
- # if filename is None:
- # self._graphs = None
- # self._targets = None
- # self._node_labels = None
- # self._edge_labels = None
- # self._node_attrs = None
- # self._edge_attrs = None
- # else:
- # self.load_dataset(filename, filename_targets=filename_targets, **kwargs)
- #
- # self._substructures = None
- # self._node_label_dim = None
- # self._edge_label_dim = None
- # self._directed = None
- # self._dataset_size = None
- # self._total_node_num = None
- # self._ave_node_num = None
- # self._min_node_num = None
- # self._max_node_num = None
- # self._total_edge_num = None
- # self._ave_edge_num = None
- # self._min_edge_num = None
- # self._max_edge_num = None
- # self._ave_node_degree = None
- # self._min_node_degree = None
- # self._max_node_degree = None
- # self._ave_fill_factor = None
- # self._min_fill_factor = None
- # self._max_fill_factor = None
- # self._node_label_nums = None
- # self._edge_label_nums = None
- # self._node_attr_dim = None
- # self._edge_attr_dim = None
- # self._class_number = None
- #
- #
- # def load_dataset(self, filename, filename_targets=None, **kwargs):
- # self._graphs, self._targets, label_names = load_dataset(filename, filename_targets=filename_targets, **kwargs)
- # self._node_labels = label_names['node_labels']
- # self._node_attrs = label_names['node_attrs']
- # self._edge_labels = label_names['edge_labels']
- # self._edge_attrs = label_names['edge_attrs']
- # self.clean_labels()
- #
- #
- # def load_graphs(self, graphs, targets=None):
- # # this has to be followed by set_labels().
- # self._graphs = graphs
- # self._targets = targets
- # # self.set_labels_attrs() # @todo
- #
- #
- # def load_predefined_dataset(self, ds_name):
- # current_path = os.path.dirname(os.path.realpath(__file__)) + '/'
- # if ds_name == 'Acyclic':
- # ds_file = current_path + '../../datasets/Acyclic/dataset_bps.ds'
- # self._graphs, self._targets, label_names = load_dataset(ds_file)
- # elif ds_name == 'AIDS':
- # ds_file = current_path + '../../datasets/AIDS/AIDS_A.txt'
- # self._graphs, self._targets, label_names = load_dataset(ds_file)
- # elif ds_name == 'Alkane':
- # ds_file = current_path + '../../datasets/Alkane/dataset.ds'
- # fn_targets = current_path + '../../datasets/Alkane/dataset_boiling_point_names.txt'
- # self._graphs, self._targets, label_names = load_dataset(ds_file, filename_targets=fn_targets)
- # elif ds_name == 'COIL-DEL':
- # ds_file = current_path + '../../datasets/COIL-DEL/COIL-DEL_A.txt'
- # self._graphs, self._targets, label_names = load_dataset(ds_file)
- # elif ds_name == 'COIL-RAG':
- # ds_file = current_path + '../../datasets/COIL-RAG/COIL-RAG_A.txt'
- # self._graphs, self._targets, label_names = load_dataset(ds_file)
- # elif ds_name == 'COLORS-3':
- # ds_file = current_path + '../../datasets/COLORS-3/COLORS-3_A.txt'
- # self._graphs, self._targets, label_names = load_dataset(ds_file)
- # elif ds_name == 'Cuneiform':
- # ds_file = current_path + '../../datasets/Cuneiform/Cuneiform_A.txt'
- # self._graphs, self._targets, label_names = load_dataset(ds_file)
- # elif ds_name == 'DD':
- # ds_file = current_path + '../../datasets/DD/DD_A.txt'
- # self._graphs, self._targets, label_names = load_dataset(ds_file)
- # elif ds_name == 'ENZYMES':
- # ds_file = current_path + '../../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'
- # self._graphs, self._targets, label_names = load_dataset(ds_file)
- # elif ds_name == 'Fingerprint':
- # ds_file = current_path + '../../datasets/Fingerprint/Fingerprint_A.txt'
- # self._graphs, self._targets, label_names = load_dataset(ds_file)
- # elif ds_name == 'FRANKENSTEIN':
- # ds_file = current_path + '../../datasets/FRANKENSTEIN/FRANKENSTEIN_A.txt'
- # self._graphs, self._targets, label_names = load_dataset(ds_file)
- # elif ds_name == 'Letter-high': # node non-symb
- # ds_file = current_path + '../../datasets/Letter-high/Letter-high_A.txt'
- # self._graphs, self._targets, label_names = load_dataset(ds_file)
- # elif ds_name == 'Letter-low': # node non-symb
- # ds_file = current_path + '../../datasets/Letter-low/Letter-low_A.txt'
- # self._graphs, self._targets, label_names = load_dataset(ds_file)
- # elif ds_name == 'Letter-med': # node non-symb
- # ds_file = current_path + '../../datasets/Letter-med/Letter-med_A.txt'
- # self._graphs, self._targets, label_names = load_dataset(ds_file)
- # elif ds_name == 'MAO':
- # ds_file = current_path + '../../datasets/MAO/dataset.ds'
- # self._graphs, self._targets, label_names = load_dataset(ds_file)
- # elif ds_name == 'Monoterpenoides':
- # ds_file = current_path + '../../datasets/Monoterpenoides/dataset_10+.ds'
- # self._graphs, self._targets, label_names = load_dataset(ds_file)
- # elif ds_name == 'MUTAG':
- # ds_file = current_path + '../../datasets/MUTAG/MUTAG_A.txt'
- # self._graphs, self._targets, label_names = load_dataset(ds_file)
- # elif ds_name == 'NCI1':
- # ds_file = current_path + '../../datasets/NCI1/NCI1_A.txt'
- # self._graphs, self._targets, label_names = load_dataset(ds_file)
- # elif ds_name == 'NCI109':
- # ds_file = current_path + '../../datasets/NCI109/NCI109_A.txt'
- # self._graphs, self._targets, label_names = load_dataset(ds_file)
- # elif ds_name == 'PAH':
- # ds_file = current_path + '../../datasets/PAH/dataset.ds'
- # self._graphs, self._targets, label_names = load_dataset(ds_file)
- # elif ds_name == 'SYNTHETIC':
- # pass
- # elif ds_name == 'SYNTHETICnew':
- # ds_file = current_path + '../../datasets/SYNTHETICnew/SYNTHETICnew_A.txt'
- # self._graphs, self._targets, label_names = load_dataset(ds_file)
- # elif ds_name == 'Synthie':
- # pass
- # else:
- # raise Exception('The dataset name "', ds_name, '" is not pre-defined.')
- #
- # self._node_labels = label_names['node_labels']
- # self._node_attrs = label_names['node_attrs']
- # self._edge_labels = label_names['edge_labels']
- # self._edge_attrs = label_names['edge_attrs']
- # self.clean_labels()
- #
-
- # def set_labels(self, node_labels=[], node_attrs=[], edge_labels=[], edge_attrs=[]):
- # self._node_labels = node_labels
- # self._node_attrs = node_attrs
- # self._edge_labels = edge_labels
- # self._edge_attrs = edge_attrs
-
- #
- # def set_labels_attrs(self, node_labels=None, node_attrs=None, edge_labels=None, edge_attrs=None):
- # # @todo: remove labels which have only one possible values.
- # if node_labels is None:
- # self._node_labels = self._graphs[0].graph['node_labels']
- # # # graphs are considered node unlabeled if all nodes have the same label.
- # # infos.update({'node_labeled': is_nl if node_label_num > 1 else False})
- # if node_attrs is None:
- # self._node_attrs = self._graphs[0].graph['node_attrs']
- # # for G in Gn:
- # # for n in G.nodes(data=True):
- # # if 'attributes' in n[1]:
- # # return len(n[1]['attributes'])
- # # return 0
- # if edge_labels is None:
- # self._edge_labels = self._graphs[0].graph['edge_labels']
- # # # graphs are considered edge unlabeled if all edges have the same label.
- # # infos.update({'edge_labeled': is_el if edge_label_num > 1 else False})
- # if edge_attrs is None:
- # self._edge_attrs = self._graphs[0].graph['edge_attrs']
- # # for G in Gn:
- # # if nx.number_of_edges(G) > 0:
- # # for e in G.edges(data=True):
- # # if 'attributes' in e[2]:
- # # return len(e[2]['attributes'])
- # # return 0
- #
- #
- # def get_dataset_infos(self, keys=None, params=None):
- # """Computes and returns the structure and property information of the graph dataset.
- #
- # Parameters
- # ----------
- # keys : list, optional
- # A list of strings which indicate which informations will be returned. The
- # possible choices includes:
- #
- # 'substructures': sub-structures graphs contains, including 'linear', 'non
- # linear' and 'cyclic'.
- #
- # 'node_label_dim': whether vertices have symbolic labels.
- #
- # 'edge_label_dim': whether egdes have symbolic labels.
- #
- # 'directed': whether graphs in dataset are directed.
- #
- # 'dataset_size': number of graphs in dataset.
- #
- # 'total_node_num': total number of vertices of all graphs in dataset.
- #
- # 'ave_node_num': average number of vertices of graphs in dataset.
- #
- # 'min_node_num': minimum number of vertices of graphs in dataset.
- #
- # 'max_node_num': maximum number of vertices of graphs in dataset.
- #
- # 'total_edge_num': total number of edges of all graphs in dataset.
- #
- # 'ave_edge_num': average number of edges of graphs in dataset.
- #
- # 'min_edge_num': minimum number of edges of graphs in dataset.
- #
- # 'max_edge_num': maximum number of edges of graphs in dataset.
- #
- # 'ave_node_degree': average vertex degree of graphs in dataset.
- #
- # 'min_node_degree': minimum vertex degree of graphs in dataset.
- #
- # 'max_node_degree': maximum vertex degree of graphs in dataset.
- #
- # 'ave_fill_factor': average fill factor (number_of_edges /
- # (number_of_nodes ** 2)) of graphs in dataset.
- #
- # 'min_fill_factor': minimum fill factor of graphs in dataset.
- #
- # 'max_fill_factor': maximum fill factor of graphs in dataset.
- #
- # 'node_label_nums': list of numbers of symbolic vertex labels of graphs in dataset.
- #
- # 'edge_label_nums': list number of symbolic edge labels of graphs in dataset.
- #
- # 'node_attr_dim': number of dimensions of non-symbolic vertex labels.
- # Extracted from the 'attributes' attribute of graph nodes.
- #
- # 'edge_attr_dim': number of dimensions of non-symbolic edge labels.
- # Extracted from the 'attributes' attribute of graph edges.
- #
- # 'class_number': number of classes. Only available for classification problems.
- #
- # 'all_degree_entropy': the entropy of degree distribution of each graph.
- #
- # 'ave_degree_entropy': the average entropy of degree distribution of all graphs.
- #
- # All informations above will be returned if `keys` is not given.
- #
- # params: dict of dict, optional
- # A dictinary which contains extra parameters for each possible
- # element in ``keys``.
- #
- # Return
- # ------
- # dict
- # Information of the graph dataset keyed by `keys`.
- # """
- # infos = {}
- #
- # if keys == None:
- # keys = [
- # 'substructures',
- # 'node_label_dim',
- # 'edge_label_dim',
- # 'directed',
- # 'dataset_size',
- # 'total_node_num',
- # 'ave_node_num',
- # 'min_node_num',
- # 'max_node_num',
- # 'total_edge_num',
- # 'ave_edge_num',
- # 'min_edge_num',
- # 'max_edge_num',
- # 'ave_node_degree',
- # 'min_node_degree',
- # 'max_node_degree',
- # 'ave_fill_factor',
- # 'min_fill_factor',
- # 'max_fill_factor',
- # 'node_label_nums',
- # 'edge_label_nums',
- # 'node_attr_dim',
- # 'edge_attr_dim',
- # 'class_number',
- # 'all_degree_entropy',
- # 'ave_degree_entropy'
- # ]
- #
- # # dataset size
- # if 'dataset_size' in keys:
- # if self._dataset_size is None:
- # self._dataset_size = self._get_dataset_size()
- # infos['dataset_size'] = self._dataset_size
- #
- # # graph node number
- # if any(i in keys for i in ['total_node_num', 'ave_node_num', 'min_node_num', 'max_node_num']):
- # all_node_nums = self._get_all_node_nums()
-
- # if 'total_node_num' in keys:
- # if self._total_node_num is None:
- # self._total_node_num = self._get_total_node_num(all_node_nums)
- # infos['total_node_num'] = self._total_node_num
- #
- # if 'ave_node_num' in keys:
- # if self._ave_node_num is None:
- # self._ave_node_num = self._get_ave_node_num(all_node_nums)
- # infos['ave_node_num'] = self._ave_node_num
- #
- # if 'min_node_num' in keys:
- # if self._min_node_num is None:
- # self._min_node_num = self._get_min_node_num(all_node_nums)
- # infos['min_node_num'] = self._min_node_num
- #
- # if 'max_node_num' in keys:
- # if self._max_node_num is None:
- # self._max_node_num = self._get_max_node_num(all_node_nums)
- # infos['max_node_num'] = self._max_node_num
- #
- # # graph edge number
- # if any(i in keys for i in ['total_edge_num', 'ave_edge_num', 'min_edge_num', 'max_edge_num']):
- # all_edge_nums = self._get_all_edge_nums()
-
- # if 'total_edge_num' in keys:
- # if self._total_edge_num is None:
- # self._total_edge_num = self._get_total_edge_num(all_edge_nums)
- # infos['total_edge_num'] = self._total_edge_num
- #
- # if 'ave_edge_num' in keys:
- # if self._ave_edge_num is None:
- # self._ave_edge_num = self._get_ave_edge_num(all_edge_nums)
- # infos['ave_edge_num'] = self._ave_edge_num
- #
- # if 'max_edge_num' in keys:
- # if self._max_edge_num is None:
- # self._max_edge_num = self._get_max_edge_num(all_edge_nums)
- # infos['max_edge_num'] = self._max_edge_num
-
- # if 'min_edge_num' in keys:
- # if self._min_edge_num is None:
- # self._min_edge_num = self._get_min_edge_num(all_edge_nums)
- # infos['min_edge_num'] = self._min_edge_num
- #
- # # label number
- # if 'node_label_dim' in keys:
- # if self._node_label_dim is None:
- # self._node_label_dim = self._get_node_label_dim()
- # infos['node_label_dim'] = self._node_label_dim
- #
- # if 'node_label_nums' in keys:
- # if self._node_label_nums is None:
- # self._node_label_nums = {}
- # for node_label in self._node_labels:
- # self._node_label_nums[node_label] = self._get_node_label_num(node_label)
- # infos['node_label_nums'] = self._node_label_nums
- #
- # if 'edge_label_dim' in keys:
- # if self._edge_label_dim is None:
- # self._edge_label_dim = self._get_edge_label_dim()
- # infos['edge_label_dim'] = self._edge_label_dim
- #
- # if 'edge_label_nums' in keys:
- # if self._edge_label_nums is None:
- # self._edge_label_nums = {}
- # for edge_label in self._edge_labels:
- # self._edge_label_nums[edge_label] = self._get_edge_label_num(edge_label)
- # infos['edge_label_nums'] = self._edge_label_nums
- #
- # if 'directed' in keys or 'substructures' in keys:
- # if self._directed is None:
- # self._directed = self._is_directed()
- # infos['directed'] = self._directed
- #
- # # node degree
- # if any(i in keys for i in ['ave_node_degree', 'max_node_degree', 'min_node_degree']):
- # all_node_degrees = self._get_all_node_degrees()
- #
- # if 'ave_node_degree' in keys:
- # if self._ave_node_degree is None:
- # self._ave_node_degree = self._get_ave_node_degree(all_node_degrees)
- # infos['ave_node_degree'] = self._ave_node_degree
- #
- # if 'max_node_degree' in keys:
- # if self._max_node_degree is None:
- # self._max_node_degree = self._get_max_node_degree(all_node_degrees)
- # infos['max_node_degree'] = self._max_node_degree
- #
- # if 'min_node_degree' in keys:
- # if self._min_node_degree is None:
- # self._min_node_degree = self._get_min_node_degree(all_node_degrees)
- # infos['min_node_degree'] = self._min_node_degree
- #
- # # fill factor
- # if any(i in keys for i in ['ave_fill_factor', 'max_fill_factor', 'min_fill_factor']):
- # all_fill_factors = self._get_all_fill_factors()
- #
- # if 'ave_fill_factor' in keys:
- # if self._ave_fill_factor is None:
- # self._ave_fill_factor = self._get_ave_fill_factor(all_fill_factors)
- # infos['ave_fill_factor'] = self._ave_fill_factor
- #
- # if 'max_fill_factor' in keys:
- # if self._max_fill_factor is None:
- # self._max_fill_factor = self._get_max_fill_factor(all_fill_factors)
- # infos['max_fill_factor'] = self._max_fill_factor
- #
- # if 'min_fill_factor' in keys:
- # if self._min_fill_factor is None:
- # self._min_fill_factor = self._get_min_fill_factor(all_fill_factors)
- # infos['min_fill_factor'] = self._min_fill_factor
- #
- # if 'substructures' in keys:
- # if self._substructures is None:
- # self._substructures = self._get_substructures()
- # infos['substructures'] = self._substructures
- #
- # if 'class_number' in keys:
- # if self._class_number is None:
- # self._class_number = self._get_class_number()
- # infos['class_number'] = self._class_number
- #
- # if 'node_attr_dim' in keys:
- # if self._node_attr_dim is None:
- # self._node_attr_dim = self._get_node_attr_dim()
- # infos['node_attr_dim'] = self._node_attr_dim
- #
- # if 'edge_attr_dim' in keys:
- # if self._edge_attr_dim is None:
- # self._edge_attr_dim = self._get_edge_attr_dim()
- # infos['edge_attr_dim'] = self._edge_attr_dim
- #
- # # entropy of degree distribution.
- #
- # if 'all_degree_entropy' in keys:
- # if params is not None and ('all_degree_entropy' in params) and ('base' in params['all_degree_entropy']):
- # base = params['all_degree_entropy']['base']
- # else:
- # base = None
- # infos['all_degree_entropy'] = self._compute_all_degree_entropy(base=base)
- #
- # if 'ave_degree_entropy' in keys:
- # if params is not None and ('ave_degree_entropy' in params) and ('base' in params['ave_degree_entropy']):
- # base = params['ave_degree_entropy']['base']
- # else:
- # base = None
- # infos['ave_degree_entropy'] = np.mean(self._compute_all_degree_entropy(base=base))
- #
- # return infos
- #
- #
- # def print_graph_infos(self, infos):
- # from collections import OrderedDict
- # keys = list(infos.keys())
- # print(OrderedDict(sorted(infos.items(), key=lambda i: keys.index(i[0]))))
- #
- #
- # def remove_labels(self, node_labels=[], edge_labels=[], node_attrs=[], edge_attrs=[]):
- # node_labels = [item for item in node_labels if item in self._node_labels]
- # edge_labels = [item for item in edge_labels if item in self._edge_labels]
- # node_attrs = [item for item in node_attrs if item in self._node_attrs]
- # edge_attrs = [item for item in edge_attrs if item in self._edge_attrs]
-
- # for g in self._graphs:
- # for nd in g.nodes():
- # for nl in node_labels:
- # del g.nodes[nd][nl]
- # for na in node_attrs:
- # del g.nodes[nd][na]
- # for ed in g.edges():
- # for el in edge_labels:
- # del g.edges[ed][el]
- # for ea in edge_attrs:
- # del g.edges[ed][ea]
- # if len(node_labels) > 0:
- # self._node_labels = [nl for nl in self._node_labels if nl not in node_labels]
- # if len(edge_labels) > 0:
- # self._edge_labels = [el for el in self._edge_labels if el not in edge_labels]
- # if len(node_attrs) > 0:
- # self._node_attrs = [na for na in self._node_attrs if na not in node_attrs]
- # if len(edge_attrs) > 0:
- # self._edge_attrs = [ea for ea in self._edge_attrs if ea not in edge_attrs]
- #
- #
- # def clean_labels(self):
- # labels = []
- # for name in self._node_labels:
- # label = set()
- # for G in self._graphs:
- # label = label | set(nx.get_node_attributes(G, name).values())
- # if len(label) > 1:
- # labels.append(name)
- # break
- # if len(label) < 2:
- # for G in self._graphs:
- # for nd in G.nodes():
- # del G.nodes[nd][name]
- # self._node_labels = labels
-
- # labels = []
- # for name in self._edge_labels:
- # label = set()
- # for G in self._graphs:
- # label = label | set(nx.get_edge_attributes(G, name).values())
- # if len(label) > 1:
- # labels.append(name)
- # break
- # if len(label) < 2:
- # for G in self._graphs:
- # for ed in G.edges():
- # del G.edges[ed][name]
- # self._edge_labels = labels
-
- # labels = []
- # for name in self._node_attrs:
- # label = set()
- # for G in self._graphs:
- # label = label | set(nx.get_node_attributes(G, name).values())
- # if len(label) > 1:
- # labels.append(name)
- # break
- # if len(label) < 2:
- # for G in self._graphs:
- # for nd in G.nodes():
- # del G.nodes[nd][name]
- # self._node_attrs = labels
-
- # labels = []
- # for name in self._edge_attrs:
- # label = set()
- # for G in self._graphs:
- # label = label | set(nx.get_edge_attributes(G, name).values())
- # if len(label) > 1:
- # labels.append(name)
- # break
- # if len(label) < 2:
- # for G in self._graphs:
- # for ed in G.edges():
- # del G.edges[ed][name]
- # self._edge_attrs = labels
- #
- #
- # def cut_graphs(self, range_):
- # self._graphs = [self._graphs[i] for i in range_]
- # if self._targets is not None:
- # self._targets = [self._targets[i] for i in range_]
- # self.clean_labels()
-
-
- # def trim_dataset(self, edge_required=False):
- # if edge_required:
- # trimed_pairs = [(idx, g) for idx, g in enumerate(self._graphs) if (nx.number_of_nodes(g) != 0 and nx.number_of_edges(g) != 0)]
- # else:
- # trimed_pairs = [(idx, g) for idx, g in enumerate(self._graphs) if nx.number_of_nodes(g) != 0]
- # idx = [p[0] for p in trimed_pairs]
- # self._graphs = [p[1] for p in trimed_pairs]
- # self._targets = [self._targets[i] for i in idx]
- # self.clean_labels()
- #
- #
- # def copy(self):
- # dataset = Dataset()
- # graphs = [g.copy() for g in self._graphs] if self._graphs is not None else None
- # target = self._targets.copy() if self._targets is not None else None
- # node_labels = self._node_labels.copy() if self._node_labels is not None else None
- # node_attrs = self._node_attrs.copy() if self._node_attrs is not None else None
- # edge_labels = self._edge_labels.copy() if self._edge_labels is not None else None
- # edge_attrs = self._edge_attrs.copy() if self._edge_attrs is not None else None
- # dataset.load_graphs(graphs, target)
- # dataset.set_labels(node_labels=node_labels, node_attrs=node_attrs, edge_labels=edge_labels, edge_attrs=edge_attrs)
- # # @todo: clean_labels and add other class members?
- # return dataset
- #
- #
- # def get_all_node_labels(self):
- # node_labels = []
- # for g in self._graphs:
- # for n in g.nodes():
- # nl = tuple(g.nodes[n].items())
- # if nl not in node_labels:
- # node_labels.append(nl)
- # return node_labels
- #
- #
- # def get_all_edge_labels(self):
- # edge_labels = []
- # for g in self._graphs:
- # for e in g.edges():
- # el = tuple(g.edges[e].items())
- # if el not in edge_labels:
- # edge_labels.append(el)
- # return edge_labels
- #
- #
- # def _get_dataset_size(self):
- # return len(self._graphs)
- #
- #
- # def _get_all_node_nums(self):
- # return [nx.number_of_nodes(G) for G in self._graphs]
- #
- #
- # def _get_total_node_nums(self, all_node_nums):
- # return np.sum(all_node_nums)
- #
- #
- # def _get_ave_node_num(self, all_node_nums):
- # return np.mean(all_node_nums)
- #
- #
- # def _get_min_node_num(self, all_node_nums):
- # return np.amin(all_node_nums)
- #
- #
- # def _get_max_node_num(self, all_node_nums):
- # return np.amax(all_node_nums)
- #
- #
- # def _get_all_edge_nums(self):
- # return [nx.number_of_edges(G) for G in self._graphs]
- #
- #
- # def _get_total_edge_nums(self, all_edge_nums):
- # return np.sum(all_edge_nums)
- #
- #
- # def _get_ave_edge_num(self, all_edge_nums):
- # return np.mean(all_edge_nums)
- #
- #
- # def _get_min_edge_num(self, all_edge_nums):
- # return np.amin(all_edge_nums)
- #
- #
- # def _get_max_edge_num(self, all_edge_nums):
- # return np.amax(all_edge_nums)
- #
- #
- # def _get_node_label_dim(self):
- # return len(self._node_labels)
- #
- #
- # def _get_node_label_num(self, node_label):
- # nl = set()
- # for G in self._graphs:
- # nl = nl | set(nx.get_node_attributes(G, node_label).values())
- # return len(nl)
- #
- #
- # def _get_edge_label_dim(self):
- # return len(self._edge_labels)
- #
- #
- # def _get_edge_label_num(self, edge_label):
- # el = set()
- # for G in self._graphs:
- # el = el | set(nx.get_edge_attributes(G, edge_label).values())
- # return len(el)
- #
- #
- # def _is_directed(self):
- # return nx.is_directed(self._graphs[0])
- #
- #
- # def _get_all_node_degrees(self):
- # return [np.mean(list(dict(G.degree()).values())) for G in self._graphs]
- #
- #
- # def _get_ave_node_degree(self, all_node_degrees):
- # return np.mean(all_node_degrees)
- #
- #
- # def _get_max_node_degree(self, all_node_degrees):
- # return np.amax(all_node_degrees)
- #
- #
- # def _get_min_node_degree(self, all_node_degrees):
- # return np.amin(all_node_degrees)
- #
- #
- # def _get_all_fill_factors(self):
- # """Get fill factors: for each graph, the number of edges divided by the squared number of nodes (a density measure of its adjacency matrix).
-
- # Returns
- # -------
- # list[float]
- # List of fill factors for all graphs.
- # """
- # return [nx.number_of_edges(G) / (nx.number_of_nodes(G) ** 2) for G in self._graphs]
- #
-
- # def _get_ave_fill_factor(self, all_fill_factors):
- # return np.mean(all_fill_factors)
- #
- #
- # def _get_max_fill_factor(self, all_fill_factors):
- # return np.amax(all_fill_factors)
- #
- #
- # def _get_min_fill_factor(self, all_fill_factors):
- # return np.amin(all_fill_factors)
- #
- #
- # def _get_substructures(self):
- # subs = set()
- # for G in self._graphs:
- # degrees = list(dict(G.degree()).values())
- # if any(i == 2 for i in degrees):
- # subs.add('linear')
- # if np.amax(degrees) >= 3:
- # subs.add('non linear')
- # if 'linear' in subs and 'non linear' in subs:
- # break
-
- # if self._directed:
- # for G in self._graphs:
- # try:  # nx.find_cycle raises NetworkXNoCycle when G is acyclic
- # nx.find_cycle(G)
- # subs.add('cyclic')
- # break
- # except nx.NetworkXNoCycle:
- # pass
- # # else:
- # # # @todo: this method does not work for big graphs with a large number of edges, such as D&D; try a better way.
- # # upper = np.amin([nx.number_of_edges(G) for G in Gn]) * 2 + 10
- # # for G in Gn:
- # # if (nx.number_of_edges(G) < upper):
- # # cyc = list(nx.simple_cycles(G.to_directed()))
- # # if any(len(i) > 2 for i in cyc):
- # # subs.add('cyclic')
- # # break
- # # if 'cyclic' not in subs:
- # # for G in Gn:
- # # cyc = list(nx.simple_cycles(G.to_directed()))
- # # if any(len(i) > 2 for i in cyc):
- # # subs.add('cyclic')
- # # break
- #
- # return subs
- #
- #
- # def _get_class_num(self):
- # return len(set(self._targets))
- #
- #
- # def _get_node_attr_dim(self):
- # return len(self._node_attrs)
- #
- #
- # def _get_edge_attr_dim(self):
- # return len(self._edge_attrs)
-
- #
- # def _compute_all_degree_entropy(self, base=None):
- # """Compute the entropy of the degree distribution of each graph.
-
- # Parameters
- # ----------
- # base : float, optional
- # The logarithmic base to use. The default is ``e`` (natural logarithm).
-
- # Returns
- # -------
- # degree_entropy : list of float
- # The entropy of the degree distribution of each graph.
- # """
- # from gklearn.utils.stats import entropy
- #
- # degree_entropy = []
- # for g in self._graphs:
- # degrees = list(dict(g.degree()).values())
- # en = entropy(degrees, base=base)
- # degree_entropy.append(en)
- # return degree_entropy
- #
- #
- # @property
- # def graphs(self):
- # return self._graphs
-
-
- # @property
- # def targets(self):
- # return self._targets
- #
- #
- # @property
- # def node_labels(self):
- # return self._node_labels
-
-
- # @property
- # def edge_labels(self):
- # return self._edge_labels
- #
- #
- # @property
- # def node_attrs(self):
- # return self._node_attrs
- #
- #
- # @property
- # def edge_attrs(self):
- # return self._edge_attrs
- #
- #
- # def split_dataset_by_target(dataset):
- # from gklearn.preimage.utils import get_same_item_indices
- #
- # graphs = dataset.graphs
- # targets = dataset.targets
- # datasets = []
- # idx_targets = get_same_item_indices(targets)
- # for key, val in idx_targets.items():
- # sub_graphs = [graphs[i] for i in val]
- # sub_dataset = Dataset()
- # sub_dataset.load_graphs(sub_graphs, [key] * len(val))
- # node_labels = dataset.node_labels.copy() if dataset.node_labels is not None else None
- # node_attrs = dataset.node_attrs.copy() if dataset.node_attrs is not None else None
- # edge_labels = dataset.edge_labels.copy() if dataset.edge_labels is not None else None
- # edge_attrs = dataset.edge_attrs.copy() if dataset.edge_attrs is not None else None
- # sub_dataset.set_labels(node_labels=node_labels, node_attrs=node_attrs, edge_labels=edge_labels, edge_attrs=edge_attrs)
- # datasets.append(sub_dataset)
- # # @todo: clean_labels?
- # return datasets
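Note from review: the only non-trivial statistic in this removed dead code is the per-graph degree-distribution entropy (_compute_all_degree_entropy). In case the computation is ever wanted outside the class, a minimal standalone sketch of the same idea follows. It assumes scipy.stats.entropy in place of the gklearn.utils.stats.entropy helper (whose normalization is not visible in this hunk), and the histogram over observed degree values is likewise an assumption, so treat it as an illustration rather than a drop-in replacement.

# Minimal sketch: per-graph degree-distribution entropy.
# Assumes scipy.stats.entropy as a stand-in for gklearn.utils.stats.entropy;
# the histogram-based normalization below is an assumption, not necessarily
# what the removed helper did.
import networkx as nx
import numpy as np
from scipy.stats import entropy


def all_degree_entropy(graphs, base=None):
    """Return the entropy of the degree distribution of each graph."""
    result = []
    for g in graphs:
        degrees = np.array([d for _, d in g.degree()])
        # Empirical distribution over observed degree values;
        # scipy normalizes the counts to probabilities internally.
        _, counts = np.unique(degrees, return_counts=True)
        result.append(entropy(counts, base=base))
    return result


if __name__ == '__main__':
    graphs = [nx.path_graph(5), nx.complete_graph(4)]
    print(all_degree_entropy(graphs))          # natural log
    print(all_degree_entropy(graphs, base=2))  # bits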