From 221b528cb5ef523629ebc1125d82ce1a8719610a Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Wed, 17 Jul 2019 15:11:55 +0200 Subject: [PATCH] Update comments of tool functions. --- pygraph/utils/graphdataset.py | 51 ++++++++++++++++++++++++++++ pygraph/utils/graphfiles.py | 34 ++++++++++++++++++- pygraph/utils/model_selection_precomputed.py | 44 +++++++++++++++++------- 3 files changed, 116 insertions(+), 13 deletions(-) diff --git a/pygraph/utils/graphdataset.py b/pygraph/utils/graphdataset.py index 4ca2c15..f74532e 100644 --- a/pygraph/utils/graphdataset.py +++ b/pygraph/utils/graphdataset.py @@ -7,6 +7,57 @@ def get_dataset_attributes(Gn, attr_names=[], node_label=None, edge_label=None): + """Returns the structure and property information of the graph dataset Gn. + + Parameters + ---------- + Gn : List of NetworkX graph + List of graphs whose information will be returned. + target : list + The list of classification targets corresponding to Gn. Only works for + classification problems. + attr_names : list + List of strings which indicate which information will be returned. The + possible choices include: + 'substructures': sub-structures Gn contains, including 'linear', 'non + linear' and 'cyclic'. + 'node_labeled': whether vertices have symbolic labels. + 'edge_labeled': whether edges have symbolic labels. + 'is_directed': whether graphs in Gn are directed. + 'dataset_size': number of graphs in Gn. + 'ave_node_num': average number of vertices of graphs in Gn. + 'min_node_num': minimum number of vertices of graphs in Gn. + 'max_node_num': maximum number of vertices of graphs in Gn. + 'ave_edge_num': average number of edges of graphs in Gn. + 'min_edge_num': minimum number of edges of graphs in Gn. + 'max_edge_num': maximum number of edges of graphs in Gn. + 'ave_node_degree': average vertex degree of graphs in Gn. + 'min_node_degree': minimum vertex degree of graphs in Gn. + 'max_node_degree': maximum vertex degree of graphs in Gn. 
+ 'ave_fill_factor': average fill factor (number_of_edges / + (number_of_nodes ** 2)) of graphs in Gn. + 'min_fill_factor': minimum fill factor of graphs in Gn. + 'max_fill_factor': maximum fill factor of graphs in Gn. + 'node_label_num': number of symbolic vertex labels. + 'edge_label_num': number of symbolic edge labels. + 'node_attr_dim': number of dimensions of non-symbolic vertex labels. + Extracted from the 'attributes' attribute of graph nodes. + 'edge_attr_dim': number of dimensions of non-symbolic edge labels. + Extracted from the 'attributes' attribute of graph edges. + 'class_number': number of classes. Only available for classification + problems. + node_label : string + Node attribute used as label. The default node label is atom. Mandatory + when 'node_labeled' or 'node_label_num' is required. + edge_label : string + Edge attribute used as label. The default edge label is bond_type. + Mandatory when 'edge_labeled' or 'edge_label_num' is required. + + Return + ------ + attrs : dict + Value for each property. + """ import networkx as nx import numpy as np diff --git a/pygraph/utils/graphfiles.py b/pygraph/utils/graphfiles.py index 621976b..b084a8d 100644 --- a/pygraph/utils/graphfiles.py +++ b/pygraph/utils/graphfiles.py @@ -372,7 +372,39 @@ def loadTXT(dirname_dataset): def loadDataset(filename, filename_y=None, extra_params=None): - """load file list of the dataset. + """Read graph data from filename and load them as NetworkX graphs. + + Parameters + ---------- + filename : string + The name of the file from where the dataset is read. + filename_y : string + The name of the file of the targets corresponding to graphs. + extra_params : dict + Extra parameters only designated to '.mat' format. + + Return + ------ + data : List of NetworkX graph. + y : List + Targets corresponding to graphs. + + Notes + ----- + This function supports the following graph dataset formats: + 'ds': load data from .ct file. See comments of function loadCT for an example. 
+ 'cxl': load data from Graph eXchange Language file (.cxl file). See + http://www.gupro.de/GXL/Introduction/background.html, 2019 for details. + 'sdf': load data from structured data file (.sdf file). See + http://www.nonlinear.com/progenesis/sdf-studio/v0.9/faq/sdf-file-format-guidance.aspx, + 2018 for details. + 'mat': Load graph data from a MATLAB (up to version 7.1) .mat file. See + README in downloadable file in http://mlcb.is.tuebingen.mpg.de/Mitarbeiter/Nino/WL/, + 2018 for details. + 'txt': Load graph data from a special .txt file. See + https://ls11-www.cs.tu-dortmund.de/staff/morris/graphkerneldatasets, + 2019 for details. Note here filename is the name of any .txt file in + the dataset directory. """ from os.path import dirname, splitext diff --git a/pygraph/utils/model_selection_precomputed.py b/pygraph/utils/model_selection_precomputed.py index 9644085..9543cbe 100644 --- a/pygraph/utils/model_selection_precomputed.py +++ b/pygraph/utils/model_selection_precomputed.py @@ -34,7 +34,9 @@ def model_selection_for_precomputed_kernel(datafile, n_jobs=1, read_gm_from_file=False, verbose=True): - """Perform model selection, fitting and testing for precomputed kernels using nested cv. Print out neccessary data during the process then finally the results. + """Perform model selection, fitting and testing for precomputed kernels + using nested CV. Print out necessary data during the process then finally + the results. Parameters ---------- @@ -43,17 +45,31 @@ def model_selection_for_precomputed_kernel(datafile, estimator : function kernel function used to estimate. This function needs to return a gram matrix. param_grid_precomputed : dictionary - Dictionary with names (string) of parameters used to calculate gram matrices as keys and lists of parameter settings to try as values. This enables searching over any sequence of parameter settings. Params with length 1 will be omitted. 
+ Dictionary with names (string) of parameters used to calculate gram + matrices as keys and lists of parameter settings to try as values. This + enables searching over any sequence of parameter settings. Params with + length 1 will be omitted. param_grid : dictionary - Dictionary with names (string) of parameters used as penelties as keys and lists of parameter settings to try as values. This enables searching over any sequence of parameter settings. Params with length 1 will be omitted. + Dictionary with names (string) of parameters used as penalties as keys + and lists of parameter settings to try as values. This enables + searching over any sequence of parameter settings. Params with length 1 + will be omitted. model_type : string - Typr of the problem, can be regression or classification. + Type of the problem, can be 'regression' or 'classification'. NUM_TRIALS : integer Number of random trials of outer cv loop. The default is 30. datafile_y : string - Path of file storing y data. This parameter is optional depending on the given dataset file. + Path of file storing y data. This parameter is optional depending on + the given dataset file. + extra_params : dict + Extra parameters for loading the dataset. See function pygraph.utils. + graphfiles.loadDataset for details. + ds_name : string + Name of the dataset. + n_jobs : int + Number of jobs for parallelization. read_gm_from_file : boolean - Whether gram matrices are loaded from file. + Whether gram matrices are loaded from a file. 
Examples -------- >>> import numpy as np >>> import sys >>> sys.path.insert(0, "../") >>> from pygraph.utils.model_selection_precomputed import model_selection_for_precomputed_kernel - >>> from pygraph.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel + >>> from pygraph.kernels.untilHPathKernel import untilhpathkernel >>> - >>> datafile = '../../../../datasets/acyclic/Acyclic/dataset_bps.ds' - >>> estimator = weisfeilerlehmankernel - >>> param_grid_precomputed = {'height': [0,1,2,3,4,5,6,7,8,9,10], 'base_kernel': ['subtree']} - >>> param_grid = {"alpha": np.logspace(-2, 2, num = 10, base = 10)} + >>> datafile = '../datasets/MUTAG/MUTAG_A.txt' + >>> estimator = untilhpathkernel + >>> param_grid_precomputed = {'depth': np.linspace(1, 10, 10), 'k_func': + ['MinMax', 'tanimoto'], 'compute_method': ['trie']} + >>> # 'C' for classification problems and 'alpha' for regression problems. + >>> param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)}, {'alpha': + np.logspace(-10, 10, num=41, base=10)}] >>> - >>> model_selection_for_precomputed_kernel(datafile, estimator, param_grid_precomputed, param_grid, 'regression') + >>> model_selection_for_precomputed_kernel(datafile, estimator, + param_grid_precomputed, param_grid[0], 'classification', ds_name='MUTAG') """ tqdm.monitor_interval = 0