
Update comments of tool functions.

v0.1
jajupmochi 6 years ago
commit 221b528cb5
3 changed files with 116 additions and 13 deletions
  1. +51 -0   pygraph/utils/graphdataset.py
  2. +33 -1   pygraph/utils/graphfiles.py
  3. +32 -12  pygraph/utils/model_selection_precomputed.py

+51 -0  pygraph/utils/graphdataset.py

@@ -7,6 +7,57 @@ def get_dataset_attributes(Gn,
attr_names=[],
node_label=None,
edge_label=None):
"""Returns the structure and property information of the graph dataset Gn.

Parameters
----------
Gn : List of NetworkX graph
List of graphs whose information will be returned.
target : list
The list of classification targets corresponding to Gn. Only works for
classification problems.
attr_names : list
List of strings indicating which information will be returned. The
possible choices include:
'substructures': sub-structures Gn contains, including 'linear', 'non
linear' and 'cyclic'.
'node_labeled': whether vertices have symbolic labels.
'edge_labeled': whether edges have symbolic labels.
'is_directed': whether graphs in Gn are directed.
'dataset_size': number of graphs in Gn.
'ave_node_num': average number of vertices of graphs in Gn.
'min_node_num': minimum number of vertices of graphs in Gn.
'max_node_num': maximum number of vertices of graphs in Gn.
'ave_edge_num': average number of edges of graphs in Gn.
'min_edge_num': minimum number of edges of graphs in Gn.
'max_edge_num': maximum number of edges of graphs in Gn.
'ave_node_degree': average vertex degree of graphs in Gn.
'min_node_degree': minimum vertex degree of graphs in Gn.
'max_node_degree': maximum vertex degree of graphs in Gn.
'ave_fill_factor': average fill factor (number_of_edges /
(number_of_nodes ** 2)) of graphs in Gn.
'min_fill_factor': minimum fill factor of graphs in Gn.
'max_fill_factor': maximum fill factor of graphs in Gn.
'node_label_num': number of symbolic vertex labels.
'edge_label_num': number of symbolic edge labels.
'node_attr_dim': number of dimensions of non-symbolic vertex labels.
Extracted from the 'attributes' attribute of graph nodes.
'edge_attr_dim': number of dimensions of non-symbolic edge labels.
Extracted from the 'attributes' attribute of graph edges.
'class_number': number of classes. Only available for classification
problems.
node_label : string
Node attribute used as label. The default node label is 'atom'. Mandatory
when 'node_labeled' or 'node_label_num' is required.
edge_label : string
Edge attribute used as label. The default edge label is 'bond_type'.
Mandatory when 'edge_labeled' or 'edge_label_num' is required.

Return
------
attrs : dict
Value for each property.
"""
import networkx as nx
import numpy as np



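A minimal usage sketch of get_dataset_attributes as documented above; the dataset path is borrowed from the example later in this commit, and the chosen attribute names and label keys are assumptions based on the documented options, not part of the commit itself.

from pygraph.utils.graphfiles import loadDataset
from pygraph.utils.graphdataset import get_dataset_attributes

# Hypothetical dataset path, reused from the docstring example further down.
Gn, y = loadDataset('../datasets/MUTAG/MUTAG_A.txt')

# Query a few of the documented properties. 'atom' and 'bond_type' are the
# documented default label names and may need adjusting for this dataset.
attrs = get_dataset_attributes(
    Gn, target=y,
    attr_names=['node_labeled', 'edge_labeled', 'dataset_size',
                'ave_node_num', 'node_label_num'],
    node_label='atom', edge_label='bond_type')

# attrs is a dict mapping each requested attribute name to its value.
print(attrs)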
+33 -1  pygraph/utils/graphfiles.py

@@ -372,7 +372,39 @@ def loadTXT(dirname_dataset):


def loadDataset(filename, filename_y=None, extra_params=None):
"""load file list of the dataset.
"""Read graph data from filename and load them as NetworkX graphs.

Parameters
----------
filename : string
The name of the file from where the dataset is read.
filename_y : string
The name of file of the targets corresponding to graphs.
extra_params : dict
Extra parameters only designated to '.mat' format.

Return
------
data : List of NetworkX graph.
y : List
Targets corresponding to graphs.

Notes
-----
This function supports the following graph dataset formats:
'ds': load data from .ct files listed in the .ds file. See the comments of
function loadCT for an example.
'cxl': load data from a Graph eXchange Language (.cxl) file. See
http://www.gupro.de/GXL/Introduction/background.html, 2019 for details.
'sdf': load data from a structured data file (.sdf). See
http://www.nonlinear.com/progenesis/sdf-studio/v0.9/faq/sdf-file-format-guidance.aspx,
2018 for details.
'mat': load graph data from a MATLAB (up to version 7.1) .mat file. See the
README in the downloadable file at http://mlcb.is.tuebingen.mpg.de/Mitarbeiter/Nino/WL/,
2018 for details.
'txt': load graph data from a special .txt file. See
https://ls11-www.cs.tu-dortmund.de/staff/morris/graphkerneldatasets,
2019 for details. Note that here filename is the name of any .txt file in
the dataset directory.
"""
from os.path import dirname, splitext


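A brief usage sketch of loadDataset following the Notes above; both paths are the ones used in examples elsewhere in this commit and are assumed to exist relative to the working directory.

from pygraph.utils.graphfiles import loadDataset

# '.ds' format: a list file pointing at .ct graphs (path from the old docstring example).
Gn_ct, y_ct = loadDataset('../../../../datasets/acyclic/Acyclic/dataset_bps.ds')

# '.txt' format: filename is any .txt file inside the dataset directory.
Gn_txt, y_txt = loadDataset('../datasets/MUTAG/MUTAG_A.txt')

# Each call returns the list of NetworkX graphs and the corresponding targets.
print(len(Gn_ct), len(y_ct), len(Gn_txt), len(y_txt))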

+32 -12  pygraph/utils/model_selection_precomputed.py

@@ -34,7 +34,9 @@ def model_selection_for_precomputed_kernel(datafile,
n_jobs=1,
read_gm_from_file=False,
verbose=True):
"""Perform model selection, fitting and testing for precomputed kernels using nested cv. Print out neccessary data during the process then finally the results.
"""Perform model selection, fitting and testing for precomputed kernels
using nested CV. Print out neccessary data during the process then finally
the results.

Parameters
----------
@@ -43,17 +45,31 @@ def model_selection_for_precomputed_kernel(datafile,
estimator : function
Kernel function used to estimate. This function needs to return a Gram matrix.
param_grid_precomputed : dictionary
Dictionary with names (string) of parameters used to calculate gram matrices as keys and lists of parameter settings to try as values. This enables searching over any sequence of parameter settings. Params with length 1 will be omitted.
Dictionary with names (string) of parameters used to calculate gram
matrices as keys and lists of parameter settings to try as values. This
enables searching over any sequence of parameter settings. Params with
length 1 will be omitted.
param_grid : dictionary
Dictionary with names (string) of parameters used as penelties as keys and lists of parameter settings to try as values. This enables searching over any sequence of parameter settings. Params with length 1 will be omitted.
Dictionary with names (string) of parameters used as penelties as keys
and lists of parameter settings to try as values. This enables
searching over any sequence of parameter settings. Params with length 1
will be omitted.
model_type : string
Typr of the problem, can be regression or classification.
Type of the problem; can be 'regression' or 'classification'.
NUM_TRIALS : integer
Number of random trials of the outer CV loop. The default is 30.
datafile_y : string
Path of file storing y data. This parameter is optional depending on the given dataset file.
Path of file storing y data. This parameter is optional depending on
the given dataset file.
extra_params : dict
Extra parameters for loading the dataset. See function
pygraph.utils.graphfiles.loadDataset for details.
ds_name : string
Name of the dataset.
n_jobs : int
Number of jobs for parallelization.
read_gm_from_file : boolean
Whether gram matrices are loaded from file.
Whether gram matrices are loaded from a file.

Examples
--------
@@ -61,14 +77,18 @@ def model_selection_for_precomputed_kernel(datafile,
>>> import sys
>>> sys.path.insert(0, "../")
>>> from pygraph.utils.model_selection_precomputed import model_selection_for_precomputed_kernel
>>> from pygraph.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel
>>> from pygraph.kernels.untilHPathKernel import untilhpathkernel
>>> import numpy as np
>>>
>>> datafile = '../../../../datasets/acyclic/Acyclic/dataset_bps.ds'
>>> estimator = weisfeilerlehmankernel
>>> param_grid_precomputed = {'height': [0,1,2,3,4,5,6,7,8,9,10], 'base_kernel': ['subtree']}
>>> param_grid = {"alpha": np.logspace(-2, 2, num = 10, base = 10)}
>>> datafile = '../datasets/MUTAG/MUTAG_A.txt'
>>> estimator = untilhpathkernel
>>> param_grid_precomputed = {'depth': np.linspace(1, 10, 10), 'k_func':
['MinMax', 'tanimoto'], 'compute_method': ['trie']}
>>> # 'C' for classification problems and 'alpha' for regression problems.
>>> param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)}, {'alpha':
np.logspace(-10, 10, num=41, base=10)}]
>>>
>>> model_selection_for_precomputed_kernel(datafile, estimator, param_grid_precomputed, param_grid, 'regression')
>>> model_selection_for_precomputed_kernel(datafile, estimator,
param_grid_precomputed, param_grid[0], 'classification', ds_name='MUTAG')
"""
tqdm.monitor_interval = 0


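A sketch of the regression counterpart to the classification call shown in the Examples section, using param_grid[1] ('alpha') as the docstring comment suggests; the Acyclic dataset path is taken from the replaced example, while the ds_name value is an assumption.

import numpy as np
from pygraph.utils.model_selection_precomputed import model_selection_for_precomputed_kernel
from pygraph.kernels.untilHPathKernel import untilhpathkernel

# Dataset path reused from the previous (regression) docstring example.
datafile = '../../../../datasets/acyclic/Acyclic/dataset_bps.ds'

param_grid_precomputed = {'depth': np.linspace(1, 10, 10),
                          'k_func': ['MinMax', 'tanimoto'],
                          'compute_method': ['trie']}
# 'C' is used for classification problems, 'alpha' for regression problems.
param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},
              {'alpha': np.logspace(-10, 10, num=41, base=10)}]

# Regression: pass the 'alpha' grid and model_type='regression'.
model_selection_for_precomputed_kernel(datafile, untilhpathkernel,
                                       param_grid_precomputed, param_grid[1],
                                       'regression', ds_name='Acyclic')  # ds_name is assumed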
