
Merge pull request #28 from jajupmochi/v0.2

V0.2
tags/v0.2.0
linlin (via GitHub), 4 years ago
parent commit 068b57afe7
27 changed files with 599 additions and 82 deletions
  1. .appveyor.yml (+3, -5)
  2. .travis.yml (+0, -1)
  3. README.md (+13, -13)
  4. gklearn/examples/ged/__init__.py (+0, -0)
  5. gklearn/examples/ged/compute_graph_edit_distance.py (+0, -0)
  6. gklearn/examples/kernels/__init__.py (+0, -0)
  7. gklearn/examples/kernels/compute_distance_in_kernel_space.py (+0, -0)
  8. gklearn/examples/kernels/compute_graph_kernel.py (+0, -0)
  9. gklearn/examples/kernels/compute_graph_kernel_old.py (+31, -0)
  10. gklearn/examples/kernels/model_selection_old.py (+38, -0)
  11. gklearn/examples/preimage/__init__.py (+0, -0)
  12. gklearn/examples/preimage/median_preimege_generator.py (+0, -0)
  13. gklearn/examples/preimage/median_preimege_generator_cml.py (+113, -0)
  14. gklearn/examples/preimage/median_preimege_generator_py.py (+114, -0)
  15. gklearn/experiments/papers/PRL_2020/runtimes_28cores.py (+64, -0)
  16. gklearn/experiments/papers/PRL_2020/synthesized_graphs_N.py (+13, -3)
  17. gklearn/experiments/papers/PRL_2020/synthesized_graphs_degrees.py (+12, -3)
  18. gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_el.py (+12, -3)
  19. gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_nl.py (+12, -3)
  20. gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_nodes.py (+12, -3)
  21. gklearn/experiments/papers/PRL_2020/utils.py (+8, -3)
  22. gklearn/kernels/common_walk.py (+22, -11)
  23. gklearn/utils/dataset.py (+9, -0)
  24. gklearn/utils/graphdataset.py (+81, -0)
  25. gklearn/utils/model_selection_precomputed.py (+17, -16)
  26. requirements_pypi.txt (+1, -1)
  27. setup.py (+24, -17)

+ 3
- 5
.appveyor.yml

@@ -1,7 +1,5 @@
environment:
matrix:
- PYTHON: "C:\\Python35"
- PYTHON: "C:\\Python35-x64"
- PYTHON: "C:\\Python36"
- PYTHON: "C:\\Python36-x64"
- PYTHON: "C:\\Python37"
@@ -17,12 +15,12 @@ environment:

install:
- "%PYTHON%\\python.exe -m pip install -U pip"
- "%PYTHON%\\python.exe -m pip install -U pytest"
- "%PYTHON%\\python.exe -m pip install -r requirements.txt"
- "%PYTHON%\\python.exe -m pip install wheel"
- "%PYTHON%\\python.exe -m pip install -r requirements.txt"
- "%PYTHON%\\python.exe -m pip install -U pytest"

build: off

test_script:
- "%PYTHON%\\python.exe setup.py bdist_wheel"
- "%PYTHON%\\python.exe -m pytest -v gklearn/tests/"
- "%PYTHON%\\python.exe -m pytest -v gklearn/tests/ --ignore=gklearn/tests/test_median_preimage_generator.py"

+ 0
- 1
.travis.yml

@@ -1,7 +1,6 @@
language: python

python:
- '3.5'
- '3.6'
- '3.7'
- '3.8'


+ 13
- 13
README.md

@@ -9,7 +9,7 @@ A Python package for graph kernels, graph edit distances and graph pre-image pro

## Requirements

* python>=3.5
* python>=3.6
* numpy>=1.16.2
* scipy>=1.1.0
* matplotlib>=3.1.0
@@ -65,27 +65,27 @@ The docs of the library can be found [here](https://graphkit-learn.readthedocs.i
### 1 List of graph kernels

* Based on walks
* [The common walk kernel](gklearn/kernels/common_walk.py) [1]
* [The common walk kernel](https://github.com/jajupmochi/graphkit-learn/tree/master/gklearn/kernels/common_walk.py) [1]
* Exponential
* Geometric
* [The marginalized kernel](gklearn/kernels/marginalized.py)
* [The marginalized kernel](https://github.com/jajupmochi/graphkit-learn/tree/master/gklearn/kernels/marginalized.py)
* With tottering [2]
* Without tottering [7]
* [The generalized random walk kernel](gklearn/kernels/random_walk.py) [3]
* [Sylvester equation](gklearn/kernels/sylvester_equation.py)
* [The generalized random walk kernel](https://github.com/jajupmochi/graphkit-learn/tree/master/gklearn/kernels/random_walk.py) [3]
* [Sylvester equation](https://github.com/jajupmochi/graphkit-learn/tree/master/gklearn/kernels/sylvester_equation.py)
* Conjugate gradient
* Fixed-point iterations
* [Spectral decomposition](gklearn/kernels/spectral_decomposition.py)
* [Spectral decomposition](https://github.com/jajupmochi/graphkit-learn/tree/master/gklearn/kernels/spectral_decomposition.py)
* Based on paths
* [The shortest path kernel](gklearn/kernels/shortest_path.py) [4]
* [The structural shortest path kernel](gklearn/kernels/structural_sp.py) [5]
* [The path kernel up to length h](gklearn/kernels/path_up_to_h.py) [6]
* [The shortest path kernel](https://github.com/jajupmochi/graphkit-learn/tree/master/gklearn/kernels/shortest_path.py) [4]
* [The structural shortest path kernel](https://github.com/jajupmochi/graphkit-learn/tree/master/gklearn/kernels/structural_sp.py) [5]
* [The path kernel up to length h](https://github.com/jajupmochi/graphkit-learn/tree/master/gklearn/kernels/path_up_to_h.py) [6]
* The Tanimoto kernel
* The MinMax kernel
* Non-linear kernels
* [The treelet kernel](gklearn/kernels/treelet.py) [10]
* [Weisfeiler-Lehman kernel](gklearn/kernels/weisfeiler_lehman.py) [11]
* [Subtree](gklearn/kernels/weisfeiler_lehman.py#L479)
* [The treelet kernel](https://github.com/jajupmochi/graphkit-learn/tree/master/gklearn/kernels/treelet.py) [10]
* [Weisfeiler-Lehman kernel](https://github.com/jajupmochi/graphkit-learn/tree/master/gklearn/kernels/weisfeiler_lehman.py) [11]
* [Subtree](https://github.com/jajupmochi/graphkit-learn/tree/master/gklearn/kernels/weisfeiler_lehman.py#L479)

A demo of computing graph kernels can be found on [Google Colab](https://colab.research.google.com/drive/17Q2QCl9CAtDweGF8LiWnWoN2laeJqT0u?usp=sharing) and in the [`examples`](https://github.com/jajupmochi/graphkit-learn/blob/master/gklearn/examples/compute_graph_kernel.py) folder.

@@ -97,7 +97,7 @@ A demo of generating graph preimages can be found on [Google Colab](https://cola

### 4 Interface to `GEDLIB`

[`GEDLIB`](https://github.com/dbblumenthal/gedlib) is an easily extensible C++ library for (suboptimally) computing the graph edit distance between attributed graphs. [A Python interface](gklearn/gedlib) for `GEDLIB` is integrated in this library, based on [`gedlibpy`](https://github.com/Ryurin/gedlibpy) library.
[`GEDLIB`](https://github.com/dbblumenthal/gedlib) is an easily extensible C++ library for (suboptimally) computing the graph edit distance between attributed graphs. [A Python interface](https://github.com/jajupmochi/graphkit-learn/tree/master/gklearn/gedlib) for `GEDLIB` is integrated in this library, based on [`gedlibpy`](https://github.com/Ryurin/gedlibpy) library.

### 5 Computation optimization methods



+ 0
- 0
gklearn/examples/ged/__init__.py


gklearn/examples/compute_graph_edit_distance.py → gklearn/examples/ged/compute_graph_edit_distance.py


+ 0
- 0
gklearn/examples/kernels/__init__.py


gklearn/examples/compute_distance_in_kernel_space.py → gklearn/examples/kernels/compute_distance_in_kernel_space.py


gklearn/examples/compute_graph_kernel.py → gklearn/examples/kernels/compute_graph_kernel.py


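The relocated `compute_distance_in_kernel_space.py` example builds on the standard kernel-induced distance. As a reminder, a minimal plain-numpy sketch of that quantity (not gklearn's internal helper):

```python
# Hedged sketch of the kernel-induced distance evaluated from a Gram matrix:
# d(g_i, g_j) = sqrt(k(g_i, g_i) + k(g_j, g_j) - 2 * k(g_i, g_j)).
import numpy as np

def distance_in_kernel_space(K, i, j):
    # Clamp tiny negative values caused by floating-point error.
    return np.sqrt(max(K[i, i] + K[j, j] - 2 * K[i, j], 0.0))

K = np.array([[1.0, 0.5],
              [0.5, 1.0]])
print(distance_in_kernel_space(K, 0, 1))  # 1.0
```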
+ 31
- 0
gklearn/examples/kernels/compute_graph_kernel_old.py

@@ -0,0 +1,31 @@
# -*- coding: utf-8 -*-
"""compute_graph_kernel_v0.1.ipynb

Automatically generated by Colaboratory.

Original file is located at
https://colab.research.google.com/drive/10jUz7-ahPiE_T1qvFrh2NvCVs1e47noj

**This script demonstrates how to compute a graph kernel.**
---

**0. Install `graphkit-learn`.**
"""

"""**1. Get dataset.**"""

from gklearn.utils.graphfiles import loadDataset

graphs, targets = loadDataset('../../../datasets/MUTAG/MUTAG_A.txt')

"""**2. Compute graph kernel.**"""

from gklearn.kernels import untilhpathkernel

gram_matrix, run_time = untilhpathkernel(
graphs, # The list of input graphs.
depth=5, # The longest length of paths.
k_func='MinMax', # Or 'tanimoto'.
compute_method='trie', # Or 'naive'.
n_jobs=1, # The number of jobs to run in parallel.
verbose=True)
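The Gram matrix produced above can be fed straight into a precomputed-kernel classifier. A minimal sketch reusing `graphs`, `targets` and `gram_matrix` from the example; the train/test split and the SVC settings are illustrative, not part of the original script:

```python
# Hedged sketch: plug the precomputed Gram matrix into scikit-learn's SVC.
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

y = np.asarray(targets)
idx_train, idx_test = train_test_split(np.arange(len(graphs)), test_size=0.2, random_state=0)

clf = SVC(kernel='precomputed', C=1.0)
clf.fit(gram_matrix[np.ix_(idx_train, idx_train)], y[idx_train])        # kernel between training graphs
acc = clf.score(gram_matrix[np.ix_(idx_test, idx_train)], y[idx_test])  # kernel between test and training graphs
print('test accuracy:', acc)
```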

+ 38
- 0
gklearn/examples/kernels/model_selection_old.py

@@ -0,0 +1,38 @@
# -*- coding: utf-8 -*-
"""model_selection_old.ipynb

Automatically generated by Colaboratory.

Original file is located at
https://colab.research.google.com/drive/1uVkl7scNgEPrimX8ks6iEC5ijuhB8L_D

**This script demonstrates how to perform model selection and classification with a graph kernel.**
---

**0. Install `graphkit-learn`.**
"""

"""**1. Perform model seletion and classification.**"""

from gklearn.utils import model_selection_for_precomputed_kernel
from gklearn.kernels import untilhpathkernel
import numpy as np

# Set parameters.
datafile = '../../../datasets/MUTAG/MUTAG_A.txt'
param_grid_precomputed = {'depth': np.linspace(1, 10, 10),
'k_func': ['MinMax', 'tanimoto'],
'compute_method': ['trie']}
param_grid = {'C': np.logspace(-10, 10, num=41, base=10)}

# Perform model selection and classification.
model_selection_for_precomputed_kernel(
datafile, # The path of dataset file.
untilhpathkernel, # The graph kernel used for estimation.
param_grid_precomputed, # The parameters used to compute gram matrices.
param_grid, # The penalty parameters used for the penalty term.
'classification', # Or 'regression'.
NUM_TRIALS=30, # The number of random trials of the outer CV loop.
ds_name='MUTAG', # The name of the dataset.
n_jobs=1,
verbose=True)
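For orientation, the two hyper-parameter grids defined above expand as follows (values exactly as in the script):

```python
# Small sketch: what the grids above actually contain.
import numpy as np

depths = np.linspace(1, 10, 10)             # path lengths 1.0, 2.0, ..., 10.0
Cs = np.logspace(-10, 10, num=41, base=10)  # 41 SVM penalty values from 1e-10 to 1e+10
print(depths)
print(Cs[0], Cs[20], Cs[-1])                # 1e-10, 1.0, 1e+10
```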

+ 0
- 0
gklearn/examples/preimage/__init__.py


gklearn/examples/median_preimege_generator.py → gklearn/examples/preimage/median_preimege_generator.py


+ 113
- 0
gklearn/examples/preimage/median_preimege_generator_cml.py

@@ -0,0 +1,113 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jun 16 15:41:26 2020

@author: ljia

**This script demonstrates how to generate a graph preimage using Boria's method with cost matrices learning.**
"""

"""**1. Get dataset.**"""

from gklearn.utils import Dataset, split_dataset_by_target

# Predefined dataset name, use dataset "MAO".
ds_name = 'MAO'
# The node/edge labels that will not be used in the computation.
irrelevant_labels = {'node_attrs': ['x', 'y', 'z'], 'edge_labels': ['bond_stereo']}

# Initialize a Dataset.
dataset_all = Dataset()
# Load predefined dataset "MAO".
dataset_all.load_predefined_dataset(ds_name)
# Remove irrelevant labels.
dataset_all.remove_labels(**irrelevant_labels)
# Split the whole dataset according to the classification targets.
datasets = split_dataset_by_target(dataset_all)
# Get the first class of graphs, whose median preimage will be computed.
dataset = datasets[0]
len(dataset.graphs)

"""**2. Set parameters.**"""

import multiprocessing

# Parameters for MedianPreimageGenerator (our method).
mpg_options = {'init_method': 'random', # how to initialize node label cost vector. "random" means to initialize randomly.
'init_ecc': [4, 4, 2, 1, 1, 1], # initial edit costs.
'ds_name': ds_name, # name of the dataset.
'parallel': True, # @todo: whether the parallel scheme is to be used.
'time_limit_in_sec': 0, # maximum time limit to compute the preimage. If set to 0 then no limit.
'max_itrs': 3, # maximum iteration limit to optimize edit costs. If set to 0 then no limit.
'max_itrs_without_update': 3, # If the edit costs have not been updated for more than this number of iterations, the optimization stops.
'epsilon_residual': 0.01, # In optimization, the residual is only considered changed if the change is bigger than this number.
'epsilon_ec': 0.1, # In optimization, the edit costs are only considered changed if the changes are bigger than this number.
'verbose': 2 # whether to print out results.
}
# Parameters for graph kernel computation.
kernel_options = {'name': 'PathUpToH', # use path kernel up to length h.
'depth': 9,
'k_func': 'MinMax',
'compute_method': 'trie',
'parallel': 'imap_unordered', # or None
'n_jobs': multiprocessing.cpu_count(),
'normalize': True, # whether to use normalized Gram matrix to optimize edit costs.
'verbose': 2 # whether to print out results.
}
# Parameters for GED computation.
ged_options = {'method': 'BIPARTITE', # use the bipartite heuristic.
'initialization_method': 'RANDOM', # or 'NODE', etc.
'initial_solutions': 10, # when bigger than 1, then the method is considered mIPFP.
'edit_cost': 'CONSTANT', # @todo: not needed. use CONSTANT cost.
'attr_distance': 'euclidean', # @todo: not needed. the distance between non-symbolic node/edge labels is computed by euclidean distance.
'ratio_runs_from_initial_solutions': 1,
'threads': multiprocessing.cpu_count(), # parallel threads. Has no effect if mpg_options['parallel'] = False.
'init_option': 'LAZY_WITHOUT_SHUFFLED_COPIES' # 'EAGER_WITHOUT_SHUFFLED_COPIES'
}
# Parameters for MedianGraphEstimator (Boria's method).
mge_options = {'init_type': 'MEDOID', # how to initialize the median (compute the set-median). "MEDOID" uses the graph with the smallest SOD.
'random_inits': 10, # number of random initialization when 'init_type' = 'RANDOM'.
'time_limit': 600, # maximum time limit to compute the generalized median. If set to 0 then no limit.
'verbose': 2, # whether to print out results.
'refine': False # whether to refine the final SODs or not.
}
print('done.')

"""**3. Run median preimage generator.**"""

from gklearn.preimage import MedianPreimageGeneratorCML

# Create median preimage generator instance.
mpg = MedianPreimageGeneratorCML()
# Add dataset.
mpg.dataset = dataset
# Set parameters.
mpg.set_options(**mpg_options.copy())
mpg.kernel_options = kernel_options.copy()
mpg.ged_options = ged_options.copy()
mpg.mge_options = mge_options.copy()
# Run.
mpg.run()

"""**4. Get results.**"""

# Get results.
import pprint
pp = pprint.PrettyPrinter(indent=4) # pretty print
results = mpg.get_results()
pp.pprint(results)

# Draw generated graphs.
def draw_graph(graph):
    import matplotlib.pyplot as plt
    import networkx as nx
    plt.figure()
    pos = nx.spring_layout(graph)
    nx.draw(graph, pos, node_size=500, labels=nx.get_node_attributes(graph, 'atom_symbol'), font_color='w', width=3, with_labels=True)
    plt.show()
    plt.clf()
    plt.close()
draw_graph(mpg.set_median)
draw_graph(mpg.gen_median)
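The `'normalize': True` kernel option requests the usual cosine-style normalization of the Gram matrix. A minimal numpy sketch of that step; gklearn's internal implementation may differ in details:

```python
# Hedged sketch of Gram-matrix normalization: K'_ij = K_ij / sqrt(K_ii * K_jj).
import numpy as np

def normalize_gram_matrix(K):
    d = np.sqrt(np.diag(K))       # square roots of the self-kernel values
    return K / np.outer(d, d)     # divide each entry by sqrt(K_ii * K_jj)

K = np.array([[4.0, 2.0],
              [2.0, 9.0]])
print(normalize_gram_matrix(K))   # diagonal entries become 1.0
```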

+ 114
- 0
gklearn/examples/preimage/median_preimege_generator_py.py

@@ -0,0 +1,114 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jun 16 15:41:26 2020

@author: ljia

**This script demonstrates how to generate a graph preimage using Boria's method with cost matrices learning.**
"""

"""**1. Get dataset.**"""

from gklearn.utils import Dataset, split_dataset_by_target

# Predefined dataset name, use dataset "MAO".
ds_name = 'MAO'
# The node/edge labels that will not be used in the computation.
irrelevant_labels = {'node_attrs': ['x', 'y', 'z'], 'edge_labels': ['bond_stereo']}

# Initialize a Dataset.
dataset_all = Dataset()
# Load predefined dataset "MAO".
dataset_all.load_predefined_dataset(ds_name)
# Remove irrelevant labels.
dataset_all.remove_labels(**irrelevant_labels)
# Split the whole dataset according to the classification targets.
datasets = split_dataset_by_target(dataset_all)
# Get the first class of graphs, whose median preimage will be computed.
dataset = datasets[0]
# dataset.cut_graphs(range(0, 10))
len(dataset.graphs)

"""**2. Set parameters.**"""

import multiprocessing

# Parameters for MedianPreimageGenerator (our method).
mpg_options = {'fit_method': 'k-graphs', # how to fit edit costs. "k-graphs" means use all graphs in median set when fitting.
'init_ecc': [4, 4, 2, 1, 1, 1], # initial edit costs.
'ds_name': ds_name, # name of the dataset.
'parallel': True, # @todo: whether the parallel scheme is to be used.
'time_limit_in_sec': 0, # maximum time limit to compute the preimage. If set to 0 then no limit.
'max_itrs': 100, # maximum iteration limit to optimize edit costs. If set to 0 then no limit.
'max_itrs_without_update': 3, # If the edit costs have not been updated for more than this number of iterations, the optimization stops.
'epsilon_residual': 0.01, # In optimization, the residual is only considered changed if the change is bigger than this number.
'epsilon_ec': 0.1, # In optimization, the edit costs are only considered changed if the changes are bigger than this number.
'verbose': 2 # whether to print out results.
}
# Parameters for graph kernel computation.
kernel_options = {'name': 'PathUpToH', # use path kernel up to length h.
'depth': 9,
'k_func': 'MinMax',
'compute_method': 'trie',
'parallel': 'imap_unordered', # or None
'n_jobs': multiprocessing.cpu_count(),
'normalize': True, # whether to use normalized Gram matrix to optimize edit costs.
'verbose': 2 # whether to print out results.
}
# Parameters for GED computation.
ged_options = {'method': 'BIPARTITE', # use the bipartite heuristic.
'initialization_method': 'RANDOM', # or 'NODE', etc.
'initial_solutions': 10, # when bigger than 1, then the method is considered mIPFP.
'edit_cost': 'CONSTANT', # use CONSTANT cost.
'attr_distance': 'euclidean', # the distance between non-symbolic node/edge labels is computed by euclidean distance.
'ratio_runs_from_initial_solutions': 1,
'threads': multiprocessing.cpu_count(), # parallel threads. Has no effect if mpg_options['parallel'] = False.
'init_option': 'LAZY_WITHOUT_SHUFFLED_COPIES' # 'EAGER_WITHOUT_SHUFFLED_COPIES'
}
# Parameters for MedianGraphEstimator (Boria's method).
mge_options = {'init_type': 'MEDOID', # how to initialize the median (compute the set-median). "MEDOID" uses the graph with the smallest SOD.
'random_inits': 10, # number of random initialization when 'init_type' = 'RANDOM'.
'time_limit': 600, # maximum time limit to compute the generalized median. If set to 0 then no limit.
'verbose': 2, # whether to print out results.
'refine': False # whether to refine the final SODs or not.
}
print('done.')

"""**3. Run median preimage generator.**"""

from gklearn.preimage import MedianPreimageGeneratorPy

# Create median preimage generator instance.
mpg = MedianPreimageGeneratorPy()
# Add dataset.
mpg.dataset = dataset
# Set parameters.
mpg.set_options(**mpg_options.copy())
mpg.kernel_options = kernel_options.copy()
mpg.ged_options = ged_options.copy()
mpg.mge_options = mge_options.copy()
# Run.
mpg.run()

"""**4. Get results.**"""

# Get results.
import pprint
pp = pprint.PrettyPrinter(indent=4) # pretty print
results = mpg.get_results()
pp.pprint(results)

# Draw generated graphs.
def draw_graph(graph):
    import matplotlib.pyplot as plt
    import networkx as nx
    plt.figure()
    pos = nx.spring_layout(graph)
    nx.draw(graph, pos, node_size=500, labels=nx.get_node_attributes(graph, 'atom_symbol'), font_color='w', width=3, with_labels=True)
    plt.show()
    plt.clf()
    plt.close()
draw_graph(mpg.set_median)
draw_graph(mpg.gen_median)

+ 64
- 0
gklearn/experiments/papers/PRL_2020/runtimes_28cores.py

@@ -0,0 +1,64 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Sep 21 10:34:26 2020

@author: ljia
"""
from utils import Graph_Kernel_List, Dataset_List, compute_graph_kernel
from gklearn.utils.graphdataset import load_predefined_dataset
import logging


# def get_graphs(ds_name):
# from gklearn.utils.graph_synthesizer import GraphSynthesizer
# gsyzer = GraphSynthesizer()
# graphs = gsyzer.unified_graphs(num_graphs=100, num_nodes=num_nodes, num_edges=int(num_nodes*2), num_node_labels=0, num_edge_labels=0, seed=None, directed=False)
# return graphs


def xp_runtimes_of_all_7cores():
    # Run and save.
    import pickle
    import os
    save_dir = 'outputs/runtimes_of_all_7cores/'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    run_times = {}
    for kernel_name in Graph_Kernel_List:
        print()
        print('Kernel:', kernel_name)

        run_times[kernel_name] = []
        for ds_name in Dataset_List:
            print()
            print('Dataset:', ds_name)

            # Get graphs.
            graphs, _ = load_predefined_dataset(ds_name)

            # Compute Gram matrix.
            try:
                gram_matrix, run_time = compute_graph_kernel(graphs, kernel_name, n_jobs=28)
                run_times[kernel_name].append(run_time)
            except Exception as exp:
                run_times[kernel_name].append('error')
                print('An exception occurred when running this experiment:')
                LOG_FILENAME = save_dir + 'error.txt'
                logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
                logging.exception('')
                print(repr(exp))

            pickle.dump(run_time, open(save_dir + 'run_time.' + kernel_name + '.' + ds_name + '.pkl', 'wb'))

    # Save all.
    pickle.dump(run_times, open(save_dir + 'run_times.pkl', 'wb'))

    return


if __name__ == '__main__':
    xp_runtimes_of_all_7cores()

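A sketch of reading the saved timings back, with the paths exactly as written by the script above:

```python
# Hedged sketch: load the pickled run times produced by xp_runtimes_of_all_7cores().
import pickle

with open('outputs/runtimes_of_all_7cores/run_times.pkl', 'rb') as f:
    run_times = pickle.load(f)

for kernel_name, times in run_times.items():
    print(kernel_name, times)  # one run time per dataset, or the string 'error'
```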
+ 13
- 3
gklearn/experiments/papers/PRL_2020/synthesized_graphs_N.py

@@ -6,6 +6,7 @@ Created on Mon Sep 21 10:34:26 2020
@author: ljia
"""
from utils import Graph_Kernel_List, compute_graph_kernel
import logging


def generate_graphs():
@@ -39,10 +40,19 @@ def xp_synthesied_graphs_dataset_size():
print('Number of graphs:', num_graphs)
sub_graphs = [g.copy() for g in graphs[0:num_graphs]]
gram_matrix, run_time = compute_graph_kernel(sub_graphs, kernel_name)
run_times[kernel_name].append(run_time)
pickle.dump(run_times, open(save_dir + 'run_time.' + kernel_name + '.' + str(num_graphs) + '.pkl', 'wb'))
try:
gram_matrix, run_time = compute_graph_kernel(sub_graphs, kernel_name, n_jobs=1)
run_times[kernel_name].append(run_time)
except Exception as exp:
run_times[kernel_name].append('error')
print('An exception occurred when running this experiment:')
LOG_FILENAME = save_dir + 'error.txt'
logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
logging.exception('')
print(repr(exp))
pickle.dump(run_time, open(save_dir + 'run_time.' + kernel_name + '.' + str(num_graphs) + '.pkl', 'wb'))
# Save all.
pickle.dump(run_times, open(save_dir + 'run_times.pkl', 'wb'))
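The added `try`/`except` block records the full traceback in `error.txt` and lets the remaining kernels continue. A self-contained sketch of the same pattern (the log path and the raised error are illustrative only):

```python
# Hedged sketch of the error-logging pattern used in the experiment scripts.
import logging

LOG_FILENAME = 'outputs/error.txt'  # illustrative path
logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)

try:
    raise ValueError('simulated failure')  # stand-in for a kernel computation
except Exception as exp:
    logging.exception('')                  # writes the full traceback to the log file
    print('An exception occurred when running this experiment:')
    print(repr(exp))
```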


+ 12
- 3
gklearn/experiments/papers/PRL_2020/synthesized_graphs_degrees.py

@@ -6,6 +6,7 @@ Created on Mon Sep 21 10:34:26 2020
@author: ljia
"""
from utils import Graph_Kernel_List, compute_graph_kernel
import logging


def generate_graphs(degree):
@@ -39,10 +40,18 @@ def xp_synthesied_graphs_degrees():
graphs = generate_graphs(degree)

# Compute Gram matrix.
gram_matrix, run_time = compute_graph_kernel(graphs, kernel_name)
run_times[kernel_name].append(run_time)
try:
gram_matrix, run_time = compute_graph_kernel(graphs, kernel_name, n_jobs=1)
run_times[kernel_name].append(run_time)
except Exception as exp:
run_times[kernel_name].append('error')
print('An exception occurred when running this experiment:')
LOG_FILENAME = save_dir + 'error.txt'
logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
logging.exception('')
print(repr(exp))
pickle.dump(run_times, open(save_dir + 'run_time.' + kernel_name + '.' + str(degree) + '.pkl', 'wb'))
pickle.dump(run_time, open(save_dir + 'run_time.' + kernel_name + '.' + str(degree) + '.pkl', 'wb'))
# Save all.
pickle.dump(run_times, open(save_dir + 'run_times.pkl', 'wb'))


+ 12
- 3
gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_el.py

@@ -6,6 +6,7 @@ Created on Mon Sep 21 10:34:26 2020
@author: ljia
"""
from utils import Graph_Kernel_List_ESym, compute_graph_kernel
import logging


def generate_graphs(num_el_alp):
@@ -39,10 +40,18 @@ def xp_synthesied_graphs_num_edge_label_alphabet():
graphs = generate_graphs(num_el_alp)

# Compute Gram matrix.
gram_matrix, run_time = compute_graph_kernel(graphs, kernel_name)
run_times[kernel_name].append(run_time)
try:
gram_matrix, run_time = compute_graph_kernel(graphs, kernel_name, n_jobs=1)
run_times[kernel_name].append(run_time)
except Exception as exp:
run_times[kernel_name].append('error')
print('An exception occurred when running this experiment:')
LOG_FILENAME = save_dir + 'error.txt'
logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
logging.exception('')
print(repr(exp))
pickle.dump(run_times, open(save_dir + 'run_time.' + kernel_name + '.' + str(num_el_alp) + '.pkl', 'wb'))
pickle.dump(run_time, open(save_dir + 'run_time.' + kernel_name + '.' + str(num_el_alp) + '.pkl', 'wb'))
# Save all.
pickle.dump(run_times, open(save_dir + 'run_times.pkl', 'wb'))


+ 12
- 3
gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_nl.py

@@ -6,6 +6,7 @@ Created on Mon Sep 21 10:34:26 2020
@author: ljia
"""
from utils import Graph_Kernel_List_VSym, compute_graph_kernel
import logging


def generate_graphs(num_nl_alp):
@@ -39,10 +40,18 @@ def xp_synthesied_graphs_num_node_label_alphabet():
graphs = generate_graphs(num_nl_alp)

# Compute Gram matrix.
gram_matrix, run_time = compute_graph_kernel(graphs, kernel_name)
run_times[kernel_name].append(run_time)
try:
gram_matrix, run_time = compute_graph_kernel(graphs, kernel_name, n_jobs=1)
run_times[kernel_name].append(run_time)
except Exception as exp:
run_times[kernel_name].append('error')
print('An exception occurred when running this experiment:')
LOG_FILENAME = save_dir + 'error.txt'
logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
logging.exception('')
print(repr(exp))
pickle.dump(run_times, open(save_dir + 'run_time.' + kernel_name + '.' + str(num_nl_alp) + '.pkl', 'wb'))
pickle.dump(run_time, open(save_dir + 'run_time.' + kernel_name + '.' + str(num_nl_alp) + '.pkl', 'wb'))
# Save all.
pickle.dump(run_times, open(save_dir + 'run_times.pkl', 'wb'))


+ 12
- 3
gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_nodes.py

@@ -6,6 +6,7 @@ Created on Mon Sep 21 10:34:26 2020
@author: ljia
"""
from utils import Graph_Kernel_List, compute_graph_kernel
import logging


def generate_graphs(num_nodes):
@@ -39,10 +40,18 @@ def xp_synthesied_graphs_num_nodes():
graphs = generate_graphs(num_nodes)

# Compute Gram matrix.
gram_matrix, run_time = compute_graph_kernel(graphs, kernel_name)
run_times[kernel_name].append(run_time)
try:
gram_matrix, run_time = compute_graph_kernel(graphs, kernel_name, n_jobs=1)
run_times[kernel_name].append(run_time)
except Exception as exp:
run_times[kernel_name].append('error')
print('An exception occurred when running this experiment:')
LOG_FILENAME = save_dir + 'error.txt'
logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
logging.exception('')
print(repr(exp))
pickle.dump(run_times, open(save_dir + 'run_time.' + kernel_name + '.' + str(num_nodes) + '.pkl', 'wb'))
pickle.dump(run_time, open(save_dir + 'run_time.' + kernel_name + '.' + str(num_nodes) + '.pkl', 'wb'))
# Save all.
pickle.dump(run_times, open(save_dir + 'run_times.pkl', 'wb'))


+ 8
- 3
gklearn/experiments/papers/PRL_2020/utils.py

@@ -5,6 +5,9 @@ Created on Tue Sep 22 11:33:28 2020

@author: ljia
"""
import multiprocessing


Graph_Kernel_List = ['PathUpToH', 'WLSubtree', 'SylvesterEquation', 'Marginalized', 'ShortestPath', 'Treelet', 'ConjugateGradient', 'FixedPoint', 'SpectralDecomposition', 'StructuralSP', 'CommonWalk']
# Graph_Kernel_List = ['CommonWalk', 'Marginalized', 'SylvesterEquation', 'ConjugateGradient', 'FixedPoint', 'SpectralDecomposition', 'ShortestPath', 'StructuralSP', 'PathUpToH', 'Treelet', 'WLSubtree']

@@ -21,8 +24,10 @@ Graph_Kernel_List_VCon = ['ShortestPath', 'ConjugateGradient', 'FixedPoint', 'St
Graph_Kernel_List_ECon = ['ConjugateGradient', 'FixedPoint', 'StructuralSP']


def compute_graph_kernel(graphs, kernel_name):
import multiprocessing
Dataset_List = ['Alkane', 'Acyclic', 'MAO', 'PAH', 'MUTAG', 'Letter-med', 'ENZYMES', 'AIDS', 'NCI1', 'NCI109', 'DD']


def compute_graph_kernel(graphs, kernel_name, n_jobs=multiprocessing.cpu_count()):
if kernel_name == 'CommonWalk':
from gklearn.kernels.commonWalkKernel import commonwalkkernel
@@ -99,7 +104,7 @@ def compute_graph_kernel(graphs, kernel_name):
params = {'base_kernel': 'subtree', 'height': 5}
# params['parallel'] = None
params['n_jobs'] = multiprocessing.cpu_count()
params['n_jobs'] = n_jobs
params['verbose'] = True
results = estimator(graphs, **params)
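With the new `n_jobs` parameter the helper can be called as below. A sketch assuming the script is run from the `PRL_2020` directory (so `utils` is importable) and the predefined datasets are available:

```python
# Hedged sketch: call the refactored helper with an explicit job count.
from utils import Graph_Kernel_List, Dataset_List, compute_graph_kernel
from gklearn.utils.graphdataset import load_predefined_dataset

graphs, _ = load_predefined_dataset('MUTAG')                                  # 'MUTAG' is in Dataset_List
gram_matrix, run_time = compute_graph_kernel(graphs, 'PathUpToH', n_jobs=4)  # 'PathUpToH' is in Graph_Kernel_List
print('Gram matrix computed in', run_time, 's')
```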

+ 22
- 11
gklearn/kernels/common_walk.py

@@ -75,9 +75,9 @@ class CommonWalk(GraphKernel):
# compute Gram matrix.
gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))
def init_worker(gn_toshare):
global G_gn
G_gn = gn_toshare
# def init_worker(gn_toshare):
# global G_gn
# G_gn = gn_toshare
# direct product graph method - exponential
if self.__compute_method == 'exp':
@@ -86,8 +86,8 @@ class CommonWalk(GraphKernel):
elif self.__compute_method == 'geo':
do_fun = self._wrapper_kernel_do_geo
parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker,
glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose)
parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=_init_worker_gm,
glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose)
return gram_matrix
@@ -130,10 +130,10 @@ class CommonWalk(GraphKernel):
# compute kernel list.
kernel_list = [None] * len(g_list)

def init_worker(g1_toshare, g_list_toshare):
global G_g1, G_g_list
G_g1 = g1_toshare
G_g_list = g_list_toshare
# def init_worker(g1_toshare, g_list_toshare):
# global G_g1, G_g_list
# G_g1 = g1_toshare
# G_g_list = g_list_toshare
# direct product graph method - exponential
if self.__compute_method == 'exp':
@@ -147,7 +147,7 @@ class CommonWalk(GraphKernel):
itr = range(len(g_list))
len_itr = len(g_list)
parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr,
init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered',
init_worker=_init_worker_list, glbv=(g1, g_list), method='imap_unordered',
n_jobs=self._n_jobs, itr_desc='calculating kernels', verbose=self._verbose)
return kernel_list
@@ -279,4 +279,15 @@ class CommonWalk(GraphKernel):
if len(self.__edge_labels) == 0 or (len(self.__edge_labels) == 1 and self.__edge_labels[0] == SpecialLabel.DUMMY):
for i in range(len(Gn)):
nx.set_edge_attributes(Gn[i], '0', SpecialLabel.DUMMY)
self.__edge_labels = [SpecialLabel.DUMMY]
self.__edge_labels = [SpecialLabel.DUMMY]
def _init_worker_gm(gn_toshare):
    global G_gn
    G_gn = gn_toshare


def _init_worker_list(g1_toshare, g_list_toshare):
    global G_g1, G_g_list
    G_g1 = g1_toshare
    G_g_list = g_list_toshare

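Moving the initializer functions to module level matters because `multiprocessing` must pickle the initializer when it starts worker processes, and nested functions are not picklable. A generic sketch of the pattern (not gklearn's actual parallel helpers):

```python
# Hedged, generic sketch of the module-level initializer pattern.
import multiprocessing

def _init_worker(shared_graphs):
    # Each worker process stores the shared data in its own module-level global.
    global G_gn
    G_gn = shared_graphs

def _pairwise_task(pair):
    i, j = pair
    return i, j, len(G_gn[i]) * len(G_gn[j])  # stand-in for a kernel value between graphs i and j

if __name__ == '__main__':
    graphs = [list(range(n)) for n in (2, 3, 4)]  # stand-in "graphs"
    pairs = [(i, j) for i in range(3) for j in range(i, 3)]
    with multiprocessing.Pool(2, initializer=_init_worker, initargs=(graphs,)) as pool:
        for i, j, val in pool.imap_unordered(_pairwise_task, pairs):
            print(i, j, val)
```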
+ 9
- 0
gklearn/utils/dataset.py

@@ -93,6 +93,9 @@ class Dataset(object):
elif ds_name == 'DD':
ds_file = current_path + '../../datasets/DD/DD_A.txt'
self.__graphs, self.__targets, label_names = load_dataset(ds_file)
elif ds_name == 'ENZYMES':
ds_file = current_path + '../../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'
self.__graphs, self.__targets, label_names = load_dataset(ds_file)
elif ds_name == 'Fingerprint':
ds_file = current_path + '../../datasets/Fingerprint/Fingerprint_A.txt'
self.__graphs, self.__targets, label_names = load_dataset(ds_file)
@@ -117,6 +120,12 @@ class Dataset(object):
elif ds_name == 'MUTAG':
ds_file = current_path + '../../datasets/MUTAG/MUTAG_A.txt'
self.__graphs, self.__targets, label_names = load_dataset(ds_file)
elif ds_name == 'NCI1':
ds_file = current_path + '../../datasets/NCI1/NCI1_A.txt'
self.__graphs, self.__targets, label_names = load_dataset(ds_file)
elif ds_name == 'NCI109':
ds_file = current_path + '../../datasets/NCI109/NCI109_A.txt'
self.__graphs, self.__targets, label_names = load_dataset(ds_file)
elif ds_name == 'PAH':
ds_file = current_path + '../../datasets/PAH/dataset.ds'
self.__graphs, self.__targets, label_names = load_dataset(ds_file)
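With these entries, `ENZYMES`, `NCI1` and `NCI109` become loadable through the `Dataset` class, as in the preimage examples above; a short sketch (the dataset files must be present under `datasets/`):

```python
# Hedged sketch: load one of the newly registered datasets via the Dataset class.
from gklearn.utils import Dataset

dataset = Dataset()
dataset.load_predefined_dataset('ENZYMES')
print(len(dataset.graphs), 'graphs loaded')
```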


+ 81
- 0
gklearn/utils/graphdataset.py

@@ -1,4 +1,6 @@
""" Obtain all kinds of attributes of a graph dataset.

This file is for old version of graphkit-learn.
"""


@@ -336,3 +338,82 @@ def get_dataset_attributes(Gn,
from collections import OrderedDict
return OrderedDict(
sorted(attrs.items(), key=lambda i: attr_names.index(i[0])))


def load_predefined_dataset(ds_name):
    import os
    from gklearn.utils.graphfiles import loadDataset

    current_path = os.path.dirname(os.path.realpath(__file__)) + '/'
    if ds_name == 'Acyclic':
        ds_file = current_path + '../../datasets/Acyclic/dataset_bps.ds'
        graphs, targets = loadDataset(ds_file)
    elif ds_name == 'AIDS':
        ds_file = current_path + '../../datasets/AIDS/AIDS_A.txt'
        graphs, targets = loadDataset(ds_file)
    elif ds_name == 'Alkane':
        ds_file = current_path + '../../datasets/Alkane/dataset.ds'
        fn_targets = current_path + '../../datasets/Alkane/dataset_boiling_point_names.txt'
        graphs, targets = loadDataset(ds_file, filename_y=fn_targets)
    elif ds_name == 'COIL-DEL':
        ds_file = current_path + '../../datasets/COIL-DEL/COIL-DEL_A.txt'
        graphs, targets = loadDataset(ds_file)
    elif ds_name == 'COIL-RAG':
        ds_file = current_path + '../../datasets/COIL-RAG/COIL-RAG_A.txt'
        graphs, targets = loadDataset(ds_file)
    elif ds_name == 'COLORS-3':
        ds_file = current_path + '../../datasets/COLORS-3/COLORS-3_A.txt'
        graphs, targets = loadDataset(ds_file)
    elif ds_name == 'Cuneiform':
        ds_file = current_path + '../../datasets/Cuneiform/Cuneiform_A.txt'
        graphs, targets = loadDataset(ds_file)
    elif ds_name == 'DD':
        ds_file = current_path + '../../datasets/DD/DD_A.txt'
        graphs, targets = loadDataset(ds_file)
    elif ds_name == 'ENZYMES':
        ds_file = current_path + '../../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'
        graphs, targets = loadDataset(ds_file)
    elif ds_name == 'Fingerprint':
        ds_file = current_path + '../../datasets/Fingerprint/Fingerprint_A.txt'
        graphs, targets = loadDataset(ds_file)
    elif ds_name == 'FRANKENSTEIN':
        ds_file = current_path + '../../datasets/FRANKENSTEIN/FRANKENSTEIN_A.txt'
        graphs, targets = loadDataset(ds_file)
    elif ds_name == 'Letter-high': # node non-symb
        ds_file = current_path + '../../datasets/Letter-high/Letter-high_A.txt'
        graphs, targets = loadDataset(ds_file)
    elif ds_name == 'Letter-low': # node non-symb
        ds_file = current_path + '../../datasets/Letter-low/Letter-low_A.txt'
        graphs, targets = loadDataset(ds_file)
    elif ds_name == 'Letter-med': # node non-symb
        ds_file = current_path + '../../datasets/Letter-med/Letter-med_A.txt'
        graphs, targets = loadDataset(ds_file)
    elif ds_name == 'MAO':
        ds_file = current_path + '../../datasets/MAO/dataset.ds'
        graphs, targets = loadDataset(ds_file)
    elif ds_name == 'Monoterpenoides':
        ds_file = current_path + '../../datasets/Monoterpenoides/dataset_10+.ds'
        graphs, targets = loadDataset(ds_file)
    elif ds_name == 'MUTAG':
        ds_file = current_path + '../../datasets/MUTAG/MUTAG_A.txt'
        graphs, targets = loadDataset(ds_file)
    elif ds_name == 'NCI1':
        ds_file = current_path + '../../datasets/NCI1/NCI1_A.txt'
        graphs, targets = loadDataset(ds_file)
    elif ds_name == 'NCI109':
        ds_file = current_path + '../../datasets/NCI109/NCI109_A.txt'
        graphs, targets = loadDataset(ds_file)
    elif ds_name == 'PAH':
        ds_file = current_path + '../../datasets/PAH/dataset.ds'
        graphs, targets = loadDataset(ds_file)
    elif ds_name == 'SYNTHETIC':
        pass
    elif ds_name == 'SYNTHETICnew':
        ds_file = current_path + '../../datasets/SYNTHETICnew/SYNTHETICnew_A.txt'
        graphs, targets = loadDataset(ds_file)
    elif ds_name == 'Synthie':
        pass
    else:
        raise Exception('The dataset name "', ds_name, '" is not pre-defined.')

    return graphs, targets

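A short sketch of the function-style loader added above; it is the entry point that `runtimes_28cores.py` imports:

```python
# Hedged sketch: load a predefined dataset with the old-style helper.
from gklearn.utils.graphdataset import load_predefined_dataset

graphs, targets = load_predefined_dataset('MUTAG')
print(len(graphs), 'graphs,', len(targets), 'targets')
```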
+ 17
- 16
gklearn/utils/model_selection_precomputed.py

@@ -30,6 +30,7 @@ def model_selection_for_precomputed_kernel(datafile,
datafile_y=None,
extra_params=None,
ds_name='ds-unknown',
output_dir='outputs/',
n_jobs=1,
read_gm_from_file=False,
verbose=True):
@@ -56,7 +57,7 @@ def model_selection_for_precomputed_kernel(datafile,
model_type : string
Type of the problem, can be 'regression' or 'classification'.
NUM_TRIALS : integer
Number of random trials of outer cv loop. The default is 30.
Number of random trials of the outer CV loop. The default is 30.
datafile_y : string
Path of file storing y data. This parameter is optional depending on
the given dataset file.
@@ -89,9 +90,9 @@ def model_selection_for_precomputed_kernel(datafile,
"""
tqdm.monitor_interval = 0

results_dir = '../notebooks/results/' + estimator.__name__
if not os.path.exists(results_dir):
os.makedirs(results_dir)
output_dir += estimator.__name__
if not os.path.exists(output_dir):
os.makedirs(output_dir)
# a string to save all the results.
str_fw = '###################### log time: ' + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + '. ######################\n\n'
str_fw += '# This file contains results of ' + estimator.__name__ + ' on dataset ' + ds_name + ',\n# including gram matrices, serial numbers for gram matrix figures and performance.\n\n'
@@ -209,7 +210,7 @@ def model_selection_for_precomputed_kernel(datafile,
# threshold=np.inf,
# floatmode='unique') + '\n\n'

fig_file_name = results_dir + '/GM[ds]' + ds_name
fig_file_name = output_dir + '/GM[ds]' + ds_name
if params_out != {}:
fig_file_name += '[params]' + str(idx)
plt.imshow(Kmatrix)
@@ -244,7 +245,7 @@ def model_selection_for_precomputed_kernel(datafile,
str_fw += '\nall gram matrices are ignored, no results obtained.\n\n'
else:
# save gram matrices to file.
# np.savez(results_dir + '/' + ds_name + '.gm',
# np.savez(output_dir + '/' + ds_name + '.gm',
# gms=gram_matrices, params=param_list_pre_revised, y=y,
# gmtime=gram_matrix_time)
if verbose:
@@ -450,7 +451,7 @@ def model_selection_for_precomputed_kernel(datafile,
print()
print('2. Reading gram matrices from file...')
str_fw += '\nII. Gram matrices.\n\nGram matrices are read from file, see last log for detail.\n'
gmfile = np.load(results_dir + '/' + ds_name + '.gm.npz')
gmfile = np.load(output_dir + '/' + ds_name + '.gm.npz')
gram_matrices = gmfile['gms'] # a list to store gram matrices for all param_grid_precomputed
gram_matrix_time = gmfile['gmtime'] # time used to compute the gram matrices
param_list_pre_revised = gmfile['params'] # list to store param grids precomputed ignoring the useless ones
@@ -603,8 +604,8 @@ def model_selection_for_precomputed_kernel(datafile,
str_fw += 'training time with hyper-param choices who did not participate in calculation of gram matrices: {:.2f}s\n\n'.format(tt_poster)

# open file to save all results for this dataset.
if not os.path.exists(results_dir):
os.makedirs(results_dir)
if not os.path.exists(output_dir):
os.makedirs(output_dir)
# print out results as table.
str_fw += printResultsInTable(param_list, param_list_pre_revised, average_val_scores,
@@ -613,11 +614,11 @@ def model_selection_for_precomputed_kernel(datafile,
model_type, verbose)
# open file to save all results for this dataset.
if not os.path.exists(results_dir + '/' + ds_name + '.output.txt'):
with open(results_dir + '/' + ds_name + '.output.txt', 'w') as f:
if not os.path.exists(output_dir + '/' + ds_name + '.output.txt'):
with open(output_dir + '/' + ds_name + '.output.txt', 'w') as f:
f.write(str_fw)
else:
with open(results_dir + '/' + ds_name + '.output.txt', 'r+') as f:
with open(output_dir + '/' + ds_name + '.output.txt', 'r+') as f:
content = f.read()
f.seek(0, 0)
f.write(str_fw + '\n\n\n' + content)
@@ -797,7 +798,7 @@ def parallel_trial_do(param_list_pre_revised, param_list, y, model_type, trial):


def compute_gram_matrices(dataset, y, estimator, param_list_precomputed,
results_dir, ds_name,
output_dir, ds_name,
n_jobs=1, str_fw='', verbose=True):
gram_matrices = [
] # a list to store gram matrices for all param_grid_precomputed
@@ -867,7 +868,7 @@ def compute_gram_matrices(dataset, y, estimator, param_list_precomputed,
# threshold=np.inf,
# floatmode='unique') + '\n\n'

fig_file_name = results_dir + '/GM[ds]' + ds_name
fig_file_name = output_dir + '/GM[ds]' + ds_name
if params_out != {}:
fig_file_name += '[params]' + str(idx)
plt.imshow(Kmatrix)
@@ -897,8 +898,8 @@ def compute_gram_matrices(dataset, y, estimator, param_list_precomputed,
return gram_matrices, gram_matrix_time, param_list_pre_revised, y, str_fw


def read_gram_matrices_from_file(results_dir, ds_name):
gmfile = np.load(results_dir + '/' + ds_name + '.gm.npz')
def read_gram_matrices_from_file(output_dir, ds_name):
gmfile = np.load(output_dir + '/' + ds_name + '.gm.npz')
gram_matrices = gmfile['gms'] # a list to store gram matrices for all param_grid_precomputed
param_list_pre_revised = gmfile['params'] # list to store param grids precomputed ignoring the useless ones
y = gmfile['y'].tolist()
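The results directory is no longer hard-coded to `../notebooks/results/`. A sketch of passing the new `output_dir` argument, with the other parameters as in `model_selection_old.py` above (the dataset path and the reduced number of trials are illustrative):

```python
# Hedged sketch: run model selection and direct all outputs to a chosen folder.
import numpy as np
from gklearn.utils import model_selection_for_precomputed_kernel
from gklearn.kernels import untilhpathkernel

model_selection_for_precomputed_kernel(
    '../../../datasets/MUTAG/MUTAG_A.txt',        # dataset file, as in the example script
    untilhpathkernel,                             # graph kernel used for estimation
    {'depth': np.linspace(1, 10, 10),
     'k_func': ['MinMax'],
     'compute_method': ['trie']},                 # grid for the Gram matrices
    {'C': np.logspace(-10, 10, num=41, base=10)}, # grid for the SVM penalty
    'classification',
    NUM_TRIALS=5,                                 # illustrative; the example script uses 30
    ds_name='MUTAG',
    output_dir='outputs/',                        # new in this PR: results and figures go here
    n_jobs=1,
    verbose=True)
```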


+ 1
- 1
requirements_pypi.txt

@@ -1,6 +1,6 @@
numpy>=1.16.2
scipy>=1.1.0
matplotlib>=3.1.0
matplotlib>=3.0.0
networkx>=2.2
scikit-learn>=0.20.0
tabulate>=0.8.2


+ 24
- 17
setup.py

@@ -1,25 +1,32 @@
import setuptools

with open("README.md", "r") as fh:
long_description = fh.read()
long_description = fh.read()

with open('requirements_pypi.txt') as fp:
install_requires = fp.read()
install_requires = fp.read()

setuptools.setup(
name="graphkit-learn",
version="0.2b2",
author="Linlin Jia",
author_email="linlin.jia@insa-rouen.fr",
description="A Python library for graph kernels, graph edit distances, and graph pre-images",
long_description=long_description,
long_description_content_type="text/markdown",
url="https://github.com/jajupmochi/graphkit-learn",
packages=setuptools.find_packages(),
classifiers=[
"Programming Language :: Python :: 3",
"License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
"Operating System :: OS Independent",
],
install_requires=install_requires,
name="graphkit-learn",
version="0.2b4",
author="Linlin Jia",
author_email="linlin.jia@insa-rouen.fr",
description="A Python library for graph kernels, graph edit distances, and graph pre-images",
long_description=long_description,
long_description_content_type="text/markdown",
project_urls={
'Documentation': 'https://graphkit-learn.readthedocs.io',
'Source': 'https://github.com/jajupmochi/graphkit-learn',
'Tracker': 'https://github.com/jajupmochi/graphkit-learn/issues',
},
url="https://github.com/jajupmochi/graphkit-learn",
packages=setuptools.find_packages(),
classifiers=[
"Programming Language :: Python :: 3",
"License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
"Operating System :: OS Independent",
'Intended Audience :: Science/Research',
'Intended Audience :: Developers',
],
install_requires=install_requires,
)
