
Check and update datasets for the FCSP comparison experiments.

v0.2.x
jajupmochi committed 4 years ago (parent commit 41015c2413)
4 changed files with 218 additions and 194 deletions:

  1. gklearn/dataset/data_fetcher.py (+1 -1)
  2. gklearn/dataset/dataset.py (+200 -186)
  3. gklearn/experiments/thesis/graph_kernels/fcsp/compare_fcsp.py (+17 -5)
  4. gklearn/kernels/structural_sp.py (+0 -2)

gklearn/dataset/data_fetcher.py (+1 -1)

@@ -71,7 +71,7 @@ class DataFetcher():
            print('Finished.', str(sum(v is not None for v in self._path)), 'of', str(len(self._path)), 'datasets are successfully fetched.')

        elif self._name not in DATASET_META:
-            message = 'Invalid Dataset name "' + self._name + '".'
+            message = 'Invalid dataset name "' + self._name + '".'
            message += '\nAvailable datasets are as follows: \n\n'
            message += '\n'.join(ds for ds in sorted(DATASET_META))
            message += '\n\nFollowing special suffices can be added to the name:'
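For context, this branch only assembles the error text shown for an unknown dataset name. A minimal standalone sketch of the same message assembly; DATASET_META here is a small stand-in dict, since the real mapping is imported from gklearn.dataset:

    # Stand-in for the real DATASET_META imported from gklearn.dataset.
    DATASET_META = {'Acyclic': {}, 'MAO': {}, 'MUTAG': {}}

    name = 'mao'  # hypothetical invalid name (the lookup is case-sensitive)
    if name not in DATASET_META:
        message = 'Invalid dataset name "' + name + '".'
        message += '\nAvailable datasets are as follows: \n\n'
        message += '\n'.join(ds for ds in sorted(DATASET_META))
        print(message)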


gklearn/dataset/dataset.py (+200 -186)

@@ -12,8 +12,8 @@ from gklearn.dataset import DATASET_META, DataFetcher, DataLoader


class Dataset(object):

    def __init__(self, inputs=None, root='datasets', filename_targets=None, targets=None, mode='networkx', clean_labels=True, reload=False, verbose=False, **kwargs):
        self._substructures = None
        self._node_label_dim = None
@@ -40,7 +40,7 @@ class Dataset(object):
        self._edge_attr_dim = None
        self._class_number = None
        self._ds_name = None

        if inputs is None:
            self._graphs = None
            self._targets = None
@@ -48,7 +48,7 @@ class Dataset(object):
            self._edge_labels = None
            self._node_attrs = None
            self._edge_attrs = None

        # If inputs is a list of graphs.
        elif isinstance(inputs, list):
            node_labels = kwargs.get('node_labels', None)
@@ -59,33 +59,31 @@ class Dataset(object):
            self.set_labels(node_labels=node_labels, node_attrs=node_attrs, edge_labels=edge_labels, edge_attrs=edge_attrs)
            if clean_labels:
                self.clean_labels()

        elif isinstance(inputs, str):
            # If inputs is predefined dataset name.
            if inputs in DATASET_META:
                self.load_predefined_dataset(inputs, root=root, clean_labels=clean_labels, reload=reload, verbose=verbose)
                self._ds_name = inputs
-            elif inputs.endswith('_unlabeled'):
-                self.load_predefined_dataset(inputs[:len(inputs) - 10], root=root, clean_labels=clean_labels, reload=reload, verbose=verbose)
+            # If the dataset is specially defined, i.g., Alkane_unlabeled, MAO_lite.
+            elif self.is_special_dataset(inputs):
+                self.load_special_dataset(inputs, root, clean_labels, reload, verbose)
                self._ds_name = inputs

-                # Deal with special suffices.
-                self.check_special_suffices()
            # If inputs is a file name.
            elif os.path.isfile(inputs):
                self.load_dataset(inputs, filename_targets=filename_targets, clean_labels=clean_labels, **kwargs)
            # If inputs is a file name.
            else:
                raise ValueError('The "inputs" argument "' + inputs + '" is not a valid dataset name or file name.')

        else:
            raise TypeError('The "inputs" argument cannot be recognized. "Inputs" can be a list of graphs, a predefined dataset name, or a file name of a dataset.')


    def load_dataset(self, filename, filename_targets=None, clean_labels=True, **kwargs):
        self._graphs, self._targets, label_names = DataLoader(filename, filename_targets=filename_targets, **kwargs).data
        self._node_labels = label_names['node_labels']
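The hunk above moves suffix handling out of __init__: name routing now goes through is_special_dataset() and load_special_dataset(), both added further down in this file. A hedged usage sketch, assuming gklearn is installed and the named datasets can be fetched; the dataset names and the 10/5-character suffix strips come straight from this diff:

    from gklearn.dataset import Dataset

    # 'Alkane_unlabeled' is not in DATASET_META, so it is routed through
    # is_special_dataset()/load_special_dataset(): the base 'Alkane' dataset
    # is loaded ('_unlabeled' is 10 characters, hence inputs[:len(inputs) - 10]),
    # then every node/edge label and attribute is removed.
    ds = Dataset('Alkane_unlabeled', root='datasets', verbose=True)

    # 'MAO_lite' loads 'MAO' ('_lite' is 5 characters) and then drops the
    # 'bond_stereo' edge label and the 'x'/'y' node attributes.
    ds_lite = Dataset('MAO_lite', root='datasets')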
@@ -94,18 +92,18 @@ class Dataset(object):
        self._edge_attrs = label_names['edge_attrs']
        if clean_labels:
            self.clean_labels()


    def load_graphs(self, graphs, targets=None):
        # this has to be followed by set_labels().
        self._graphs = graphs
        self._targets = targets
        # self.set_labels_attrs() # @todo


    def load_predefined_dataset(self, ds_name, root='datasets', clean_labels=True, reload=False, verbose=False):
        path = DataFetcher(name=ds_name, root=root, reload=reload, verbose=verbose).path

        if DATASET_META[ds_name]['database'] == 'tudataset':
            ds_file = os.path.join(path, ds_name + '_A.txt')
            fn_targets = None
@@ -116,21 +114,21 @@ class Dataset(object):
            else: # load_files[0] is a list of files.
                ds_file = [os.path.join(path, fn) for fn in load_files[0]]
                fn_targets = os.path.join(path, load_files[1]) if len(load_files) == 2 else None

        self._graphs, self._targets, label_names = DataLoader(ds_file, filename_targets=fn_targets).data
        self._node_labels = label_names['node_labels']
        self._node_attrs = label_names['node_attrs']
        self._edge_labels = label_names['edge_labels']
        self._edge_attrs = label_names['edge_attrs']
        if clean_labels:
            self.clean_labels()

        # Deal with specific datasets.
        if ds_name == 'Alkane':
            self.trim_dataset(edge_required=True)
            self.remove_labels(node_labels=['atom_symbol'])


    def set_labels(self, node_labels=[], node_attrs=[], edge_labels=[], edge_attrs=[]):
        self._node_labels = node_labels
@@ -138,7 +136,7 @@ class Dataset(object):
        self._edge_labels = edge_labels
        self._edge_attrs = edge_attrs


    def set_labels_attrs(self, node_labels=None, node_attrs=None, edge_labels=None, edge_attrs=None):
        # @todo: remove labels which have only one possible values.
        if node_labels is None:
@@ -164,86 +162,86 @@ class Dataset(object):
#            if 'attributes' in e[2]:
#                return len(e[2]['attributes'])
#            return 0


    def get_dataset_infos(self, keys=None, params=None):
        """Computes and returns the structure and property information of the graph dataset.

        Parameters
        ----------
        keys : list, optional
            A list of strings which indicate which informations will be returned. The
            possible choices includes:
            'substructures': sub-structures graphs contains, including 'linear', 'non
                linear' and 'cyclic'.
            'node_label_dim': whether vertices have symbolic labels.
            'edge_label_dim': whether egdes have symbolic labels.
            'directed': whether graphs in dataset are directed.
            'dataset_size': number of graphs in dataset.
            'total_node_num': total number of vertices of all graphs in dataset.
            'ave_node_num': average number of vertices of graphs in dataset.
            'min_node_num': minimum number of vertices of graphs in dataset.
            'max_node_num': maximum number of vertices of graphs in dataset.
            'total_edge_num': total number of edges of all graphs in dataset.
            'ave_edge_num': average number of edges of graphs in dataset.
            'min_edge_num': minimum number of edges of graphs in dataset.
            'max_edge_num': maximum number of edges of graphs in dataset.
            'ave_node_degree': average vertex degree of graphs in dataset.
            'min_node_degree': minimum vertex degree of graphs in dataset.
            'max_node_degree': maximum vertex degree of graphs in dataset.
            'ave_fill_factor': average fill factor (number_of_edges /
                (number_of_nodes ** 2)) of graphs in dataset.
            'min_fill_factor': minimum fill factor of graphs in dataset.
            'max_fill_factor': maximum fill factor of graphs in dataset.
            'node_label_nums': list of numbers of symbolic vertex labels of graphs in dataset.
            'edge_label_nums': list number of symbolic edge labels of graphs in dataset.
            'node_attr_dim': number of dimensions of non-symbolic vertex labels.
                Extracted from the 'attributes' attribute of graph nodes.
            'edge_attr_dim': number of dimensions of non-symbolic edge labels.
                Extracted from the 'attributes' attribute of graph edges.
            'class_number': number of classes. Only available for classification problems.
            'all_degree_entropy': the entropy of degree distribution of each graph.
            'ave_degree_entropy': the average entropy of degree distribution of all graphs.
            All informations above will be returned if `keys` is not given.
        params: dict of dict, optional
            A dictinary which contains extra parameters for each possible
            element in ``keys``.

        Return
        ------
        dict
            Information of the graph dataset keyed by `keys`.
        """
        infos = {}

        if keys == None:
            keys = [
                'substructures',
@@ -273,13 +271,13 @@ class Dataset(object):
                'all_degree_entropy',
                'ave_degree_entropy'
            ]

        # dataset size
        if 'dataset_size' in keys:
            if self._dataset_size is None:
                self._dataset_size = self._get_dataset_size()
            infos['dataset_size'] = self._dataset_size

        # graph node number
        if any(i in keys for i in ['total_node_num', 'ave_node_num', 'min_node_num', 'max_node_num']):
            all_node_nums = self._get_all_node_nums()
@@ -288,22 +286,22 @@ class Dataset(object):
            if self._total_node_num is None:
                self._total_node_num = self._get_total_node_num(all_node_nums)
            infos['total_node_num'] = self._total_node_num

        if 'ave_node_num' in keys:
            if self._ave_node_num is None:
                self._ave_node_num = self._get_ave_node_num(all_node_nums)
            infos['ave_node_num'] = self._ave_node_num

        if 'min_node_num' in keys:
            if self._min_node_num is None:
                self._min_node_num = self._get_min_node_num(all_node_nums)
            infos['min_node_num'] = self._min_node_num

        if 'max_node_num' in keys:
            if self._max_node_num is None:
                self._max_node_num = self._get_max_node_num(all_node_nums)
            infos['max_node_num'] = self._max_node_num

        # graph edge number
        if any(i in keys for i in ['total_edge_num', 'ave_edge_num', 'min_edge_num', 'max_edge_num']):
            all_edge_nums = self._get_all_edge_nums()
@@ -312,12 +310,12 @@ class Dataset(object):
            if self._total_edge_num is None:
                self._total_edge_num = self._get_total_edge_num(all_edge_nums)
            infos['total_edge_num'] = self._total_edge_num

        if 'ave_edge_num' in keys:
            if self._ave_edge_num is None:
                self._ave_edge_num = self._get_ave_edge_num(all_edge_nums)
            infos['ave_edge_num'] = self._ave_edge_num

        if 'max_edge_num' in keys:
            if self._max_edge_num is None:
                self._max_edge_num = self._get_max_edge_num(all_edge_nums)
@@ -327,120 +325,120 @@ class Dataset(object):
            if self._min_edge_num is None:
                self._min_edge_num = self._get_min_edge_num(all_edge_nums)
            infos['min_edge_num'] = self._min_edge_num

        # label number
        if 'node_label_dim' in keys:
            if self._node_label_dim is None:
                self._node_label_dim = self._get_node_label_dim()
            infos['node_label_dim'] = self._node_label_dim

        if 'node_label_nums' in keys:
            if self._node_label_nums is None:
                self._node_label_nums = {}
                for node_label in self._node_labels:
                    self._node_label_nums[node_label] = self._get_node_label_num(node_label)
            infos['node_label_nums'] = self._node_label_nums

        if 'edge_label_dim' in keys:
            if self._edge_label_dim is None:
                self._edge_label_dim = self._get_edge_label_dim()
            infos['edge_label_dim'] = self._edge_label_dim

        if 'edge_label_nums' in keys:
            if self._edge_label_nums is None:
                self._edge_label_nums = {}
                for edge_label in self._edge_labels:
                    self._edge_label_nums[edge_label] = self._get_edge_label_num(edge_label)
            infos['edge_label_nums'] = self._edge_label_nums

        if 'directed' in keys or 'substructures' in keys:
            if self._directed is None:
                self._directed = self._is_directed()
            infos['directed'] = self._directed

        # node degree
        if any(i in keys for i in ['ave_node_degree', 'max_node_degree', 'min_node_degree']):
            all_node_degrees = self._get_all_node_degrees()

        if 'ave_node_degree' in keys:
            if self._ave_node_degree is None:
                self._ave_node_degree = self._get_ave_node_degree(all_node_degrees)
            infos['ave_node_degree'] = self._ave_node_degree

        if 'max_node_degree' in keys:
            if self._max_node_degree is None:
                self._max_node_degree = self._get_max_node_degree(all_node_degrees)
            infos['max_node_degree'] = self._max_node_degree

        if 'min_node_degree' in keys:
            if self._min_node_degree is None:
                self._min_node_degree = self._get_min_node_degree(all_node_degrees)
            infos['min_node_degree'] = self._min_node_degree

        # fill factor
        if any(i in keys for i in ['ave_fill_factor', 'max_fill_factor', 'min_fill_factor']):
            all_fill_factors = self._get_all_fill_factors()

        if 'ave_fill_factor' in keys:
            if self._ave_fill_factor is None:
                self._ave_fill_factor = self._get_ave_fill_factor(all_fill_factors)
            infos['ave_fill_factor'] = self._ave_fill_factor

        if 'max_fill_factor' in keys:
            if self._max_fill_factor is None:
                self._max_fill_factor = self._get_max_fill_factor(all_fill_factors)
            infos['max_fill_factor'] = self._max_fill_factor

        if 'min_fill_factor' in keys:
            if self._min_fill_factor is None:
                self._min_fill_factor = self._get_min_fill_factor(all_fill_factors)
            infos['min_fill_factor'] = self._min_fill_factor

        if 'substructures' in keys:
            if self._substructures is None:
                self._substructures = self._get_substructures()
            infos['substructures'] = self._substructures

        if 'class_number' in keys:
            if self._class_number is None:
                self._class_number = self._get_class_number()
            infos['class_number'] = self._class_number

        if 'node_attr_dim' in keys:
            if self._node_attr_dim is None:
                self._node_attr_dim = self._get_node_attr_dim()
            infos['node_attr_dim'] = self._node_attr_dim

        if 'edge_attr_dim' in keys:
            if self._edge_attr_dim is None:
                self._edge_attr_dim = self._get_edge_attr_dim()
            infos['edge_attr_dim'] = self._edge_attr_dim

        # entropy of degree distribution.
        if 'all_degree_entropy' in keys:
            if params is not None and ('all_degree_entropy' in params) and ('base' in params['all_degree_entropy']):
                base = params['all_degree_entropy']['base']
            else:
                base = None
            infos['all_degree_entropy'] = self._compute_all_degree_entropy(base=base)

        if 'ave_degree_entropy' in keys:
            if params is not None and ('ave_degree_entropy' in params) and ('base' in params['ave_degree_entropy']):
                base = params['ave_degree_entropy']['base']
            else:
                base = None
            infos['ave_degree_entropy'] = np.mean(self._compute_all_degree_entropy(base=base))

        return infos


    def print_graph_infos(self, infos):
        from collections import OrderedDict
        keys = list(infos.keys())
        print(OrderedDict(sorted(infos.items(), key=lambda i: keys.index(i[0]))))


    def remove_labels(self, node_labels=[], edge_labels=[], node_attrs=[], edge_attrs=[]):
        node_labels = [item for item in node_labels if item in self._node_labels]
        edge_labels = [item for item in edge_labels if item in self._edge_labels]
@@ -466,8 +464,8 @@ class Dataset(object):
            self._node_attrs = [na for na in self._node_attrs if na not in node_attrs]
        if len(edge_attrs) > 0:
            self._edge_attrs = [ea for ea in self._edge_attrs if ea not in edge_attrs]


    def clean_labels(self):
        labels = []
        for name in self._node_labels:
@@ -524,8 +522,8 @@ class Dataset(object):
                    for ed in G.edges():
                        del G.edges[ed][name]
            self._edge_attrs = labels


    def cut_graphs(self, range_):
        self._graphs = [self._graphs[i] for i in range_]
        if self._targets is not None:
@@ -542,8 +540,8 @@ class Dataset(object):
            self._graphs = [p[1] for p in trimed_pairs]
            self._targets = [self._targets[i] for i in idx]
            self.clean_labels()


    def copy(self):
        dataset = Dataset()
        graphs = [g.copy() for g in self._graphs] if self._graphs is not None else None
@@ -556,16 +554,32 @@ class Dataset(object):
        dataset.set_labels(node_labels=node_labels, node_attrs=node_attrs, edge_labels=edge_labels, edge_attrs=edge_attrs)
        # @todo: clean_labels and add other class members?
        return dataset


-    def check_special_suffices(self):
-        if self._ds_name.endswith('_unlabeled'):
-            self.remove_labels(node_labels=self._node_labels,
-                               edge_labels=self._edge_labels,
-                               node_attrs=self._node_attrs,
+    def is_special_dataset(self, inputs):
+        if inputs.endswith('_unlabeled'):
+            return True
+        if inputs == 'MAO_lite':
+            return True
+        return False
+
+
+    def load_special_dataset(self, inputs, root, clean_labels, reload, verbose):
+        if inputs.endswith('_unlabeled'):
+            self.load_predefined_dataset(inputs[:len(inputs) - 10], root=root, clean_labels=clean_labels, reload=reload, verbose=verbose)
+            self.remove_labels(node_labels=self._node_labels,
+                               edge_labels=self._edge_labels,
+                               node_attrs=self._node_attrs,
                               edge_attrs=self._edge_attrs)
+
+        elif inputs == 'MAO_lite':
+            self.load_predefined_dataset(inputs[:len(inputs) - 5], root=root, clean_labels=clean_labels, reload=reload, verbose=verbose)
+            self.remove_labels(edge_labels=['bond_stereo'], node_attrs=['x', 'y'])


    def get_all_node_labels(self):
        node_labels = []
        for g in self._graphs:
@@ -574,8 +588,8 @@ class Dataset(object):
                if nl not in node_labels:
                    node_labels.append(nl)
        return node_labels


    def get_all_edge_labels(self):
        edge_labels = []
        for g in self._graphs:
@@ -584,94 +598,94 @@ class Dataset(object):
                if el not in edge_labels:
                    edge_labels.append(el)
        return edge_labels


    def _get_dataset_size(self):
        return len(self._graphs)


    def _get_all_node_nums(self):
        return [nx.number_of_nodes(G) for G in self._graphs]


    def _get_total_node_nums(self, all_node_nums):
        return np.sum(all_node_nums)


    def _get_ave_node_num(self, all_node_nums):
        return np.mean(all_node_nums)


    def _get_min_node_num(self, all_node_nums):
        return np.amin(all_node_nums)


    def _get_max_node_num(self, all_node_nums):
        return np.amax(all_node_nums)


    def _get_all_edge_nums(self):
        return [nx.number_of_edges(G) for G in self._graphs]


    def _get_total_edge_nums(self, all_edge_nums):
        return np.sum(all_edge_nums)


    def _get_ave_edge_num(self, all_edge_nums):
        return np.mean(all_edge_nums)


    def _get_min_edge_num(self, all_edge_nums):
        return np.amin(all_edge_nums)


    def _get_max_edge_num(self, all_edge_nums):
        return np.amax(all_edge_nums)


    def _get_node_label_dim(self):
        return len(self._node_labels)


    def _get_node_label_num(self, node_label):
        nl = set()
        for G in self._graphs:
            nl = nl | set(nx.get_node_attributes(G, node_label).values())
        return len(nl)


    def _get_edge_label_dim(self):
        return len(self._edge_labels)


    def _get_edge_label_num(self, edge_label):
        el = set()
        for G in self._graphs:
            el = el | set(nx.get_edge_attributes(G, edge_label).values())
        return len(el)


    def _is_directed(self):
        return nx.is_directed(self._graphs[0])


    def _get_all_node_degrees(self):
        return [np.mean(list(dict(G.degree()).values())) for G in self._graphs]


    def _get_ave_node_degree(self, all_node_degrees):
        return np.mean(all_node_degrees)


    def _get_max_node_degree(self, all_node_degrees):
        return np.amax(all_node_degrees)


    def _get_min_node_degree(self, all_node_degrees):
        return np.amin(all_node_degrees)


    def _get_all_fill_factors(self):
        """Get fill factor, the number of non-zero entries in the adjacency matrix.

@@ -681,20 +695,20 @@ class Dataset(object):
            List of fill factors for all graphs.
        """
        return [nx.number_of_edges(G) / (nx.number_of_nodes(G) ** 2) for G in self._graphs]


    def _get_ave_fill_factor(self, all_fill_factors):
        return np.mean(all_fill_factors)


    def _get_max_fill_factor(self, all_fill_factors):
        return np.amax(all_fill_factors)


    def _get_min_fill_factor(self, all_fill_factors):
        return np.amin(all_fill_factors)


    def _get_substructures(self):
        subs = set()
        for G in self._graphs:
@@ -726,22 +740,22 @@ class Dataset(object):
#            if any(len(i) > 2 for i in cyc):
#                subs.add('cyclic')
#                break

        return subs


    def _get_class_num(self):
        return len(set(self._targets))


    def _get_node_attr_dim(self):
        return len(self._node_attrs)


    def _get_edge_attr_dim(self):
        return len(self._edge_attrs)


    def _compute_all_degree_entropy(self, base=None):
        """Compute the entropy of degree distribution of each graph.

@@ -756,15 +770,15 @@ class Dataset(object):
            The calculated entropy.
        """
        from gklearn.utils.stats import entropy

        degree_entropy = []
        for g in self._graphs:
            degrees = list(dict(g.degree()).values())
            en = entropy(degrees, base=base)
            degree_entropy.append(en)
        return degree_entropy


    @property
    def graphs(self):
        return self._graphs
@@ -773,8 +787,8 @@ class Dataset(object):
    @property
    def targets(self):
        return self._targets


    @property
    def node_labels(self):
        return self._node_labels
@@ -783,21 +797,21 @@ class Dataset(object):
    @property
    def edge_labels(self):
        return self._edge_labels


    @property
    def node_attrs(self):
        return self._node_attrs


    @property
    def edge_attrs(self):
        return self._edge_attrs


def split_dataset_by_target(dataset):
    from gklearn.preimage.utils import get_same_item_indices
    graphs = dataset.graphs
    targets = dataset.targets
    datasets = []
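Most of the remaining churn in this file is whitespace; the cached-statistics logic in get_dataset_infos() is unchanged. As a quick reference, a hedged sketch of querying those statistics, with key names taken from the docstring above and 'MUTAG' assumed to be available in DATASET_META (the 'base' parameter is the log base passed to the entropy computation):

    from gklearn.dataset import Dataset

    ds = Dataset('MUTAG', root='datasets')

    # Each requested key is computed once and then cached on the instance.
    infos = ds.get_dataset_infos(
        keys=['dataset_size', 'ave_node_num', 'ave_edge_num',
              'node_label_dim', 'ave_degree_entropy'],
        params={'ave_degree_entropy': {'base': 2}})  # entropy in bits
    ds.print_graph_infos(infos)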


gklearn/experiments/thesis/graph_kernels/fcsp/compare_fcsp.py (+17 -5)

@@ -14,6 +14,7 @@ import functools
import os
import pickle
import sys
+import logging


def run_all(fcsp):
@@ -23,13 +24,17 @@ def run_all(fcsp):
    from sklearn.model_selection import ParameterGrid

    Dataset_List = ['Alkane_unlabeled', 'Alkane', 'Acyclic', 'MAO_lite', 'MAO',
-                   'PAH', 'MUTAG', 'Monoterpenoids', 'Letter-high',
-                   'Letter-med', 'Letter-low',
-                   'ENZYMES', 'AIDS_lite', 'AIDS', 'NCI1', 'NCI109', 'DD']
+                   'PAH_unlabeled', 'PAH', 'MUTAG', 'Letter-high', 'Letter-med', 'Letter-low',
+                   'ENZYMES', 'AIDS', 'NCI1', 'NCI109', 'DD',
+                   'BZR', 'COX2', 'DHFR', 'PTC_FM', 'PTC_FR', 'PTC_MM', 'PTC_MR',
+                   'Cuneiform', 'KKI', 'OHSU', 'Peking_1', 'SYNTHETICnew',
+                   'Synthie', 'SYNTHETIC', 'Fingerprint', 'IMDB-BINARY',
+                   'IMDB-MULTI', 'COIL-DEL', 'PROTEINS', 'PROTEINS_full',
+                   'Mutagenicity', 'REDDIT-BINARY']

    Kernel_List = ['ShortestPath', 'StructuralSP']

-    work_grid = ParameterGrid({'kernel': Kernel_List[0:], 'dataset': Dataset_List[2:3]})
+    work_grid = ParameterGrid({'kernel': Kernel_List[:], 'dataset': Dataset_List[:]})

    for work in list(work_grid):
@@ -39,7 +44,14 @@ def run_all(fcsp):
        print()
        print((work['kernel'], work['dataset']))

-        gram_matrix, run_time = run_work(work['kernel'], work['dataset'], fcsp)
+        try:
+            gram_matrix, run_time = run_work(work['kernel'], work['dataset'], fcsp)
+        except Exception as exp:
+            print('An exception occured when running this experiment:')
+            LOG_FILENAME = save_dir + 'error.txt'
+            logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
+            logging.exception(save_file_suffix)
+            print(repr(exp))

        save_file_suffix = '.' + work['kernel'] + work['dataset']
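The try/except wrapper keeps one failing kernel/dataset pair from aborting the whole grid. A standalone sketch of the same logging pattern, with assumed values for save_dir and save_file_suffix (the real script defines both elsewhere) and a hypothetical run_work() that simulates a failure; note that logging.exception() must be called inside an except block so an active traceback is recorded:

    import logging
    import os

    save_dir = 'outputs/'  # assumed; the real script builds this path elsewhere
    save_file_suffix = '.ShortestPath' + 'Acyclic'  # assumed example value
    os.makedirs(save_dir, exist_ok=True)

    def run_work(kernel, dataset, fcsp):
        # Hypothetical stand-in for run_work() in compare_fcsp.py.
        raise MemoryError('simulated failure on ' + dataset)

    try:
        gram_matrix, run_time = run_work('ShortestPath', 'Acyclic', True)
    except Exception as exp:
        print('An exception occurred when running this experiment:')
        LOG_FILENAME = os.path.join(save_dir, 'error.txt')
        logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
        # logging.exception() writes the message plus the active traceback.
        logging.exception(save_file_suffix)
        print(repr(exp))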




gklearn/kernels/structural_sp.py (+0 -2)

@@ -252,7 +252,6 @@ class StructuralSP(GraphKernel):
                            if not kpath:
                                break
                        kernel += kpath # add up kernels of all paths
-                    # print(kernel, ',', p1, ',', p2)
                else:
                    for p1, p2 in product(spl1, spl2):
                        if len(p1) == len(p2):
@@ -399,7 +398,6 @@ class StructuralSP(GraphKernel):
                            if not kpath:
                                break
                        kernel += kpath # add up kernels of all paths
-                    # print(kernel, ',', p1, ',', p2)
                else:
                    for p1, p2 in product(spl1, spl2):
                        if len(p1) == len(p2):

