Browse Source

Merge pull request #17 from jajupmochi/v0.2

V0.2
tags/v0.2.0
linlin GitHub 5 years ago
parent
commit
eadc91b9df
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
24 changed files with 4897 additions and 2157 deletions
  1. +28
    -0
      .appveyor.yml
  2. +1
    -0
      README.md
  3. +2
    -1
      gklearn/ged/env/__init__.py
  4. +80
    -0
      gklearn/ged/env/node_map.py
  5. +642
    -143
      gklearn/ged/median/median_graph_estimator.py
  6. +87
    -9
      gklearn/ged/median/test_median_graph_estimator.py
  7. +2
    -0
      gklearn/ged/median/utils.py
  8. +2855
    -1747
      gklearn/gedlib/gedlibpy.cpp
  9. BIN
      gklearn/gedlib/gedlibpy.cpython-36m-x86_64-linux-gnu.so
  10. +44
    -19
      gklearn/gedlib/gedlibpy.pyx
  11. +3
    -2
      gklearn/gedlib/src/GedLibBind.hpp
  12. +89
    -54
      gklearn/gedlib/src/GedLibBind.ipp
  13. +1
    -1
      gklearn/kernels/__init__.py
  14. +9
    -8
      gklearn/kernels/path_up_to_h.py
  15. +9
    -8
      gklearn/kernels/treelet.py
  16. +25
    -7
      gklearn/kernels/weisfeiler_lehman.py
  17. +801
    -56
      gklearn/preimage/experiments/xp_median_preimage.py
  18. +21
    -47
      gklearn/preimage/median_preimage_generator.py
  19. +30
    -7
      gklearn/preimage/utils.py
  20. +6
    -6
      gklearn/tests/test_graph_kernels.py
  21. +1
    -0
      gklearn/utils/__init__.py
  22. +102
    -30
      gklearn/utils/dataset.py
  23. +6
    -6
      gklearn/utils/graph_files.py
  24. +53
    -6
      gklearn/utils/utils.py

+ 28
- 0
.appveyor.yml View File

@@ -0,0 +1,28 @@
# AppVeyor CI configuration: for every Python installation listed in the
# matrix, install the dependencies, build a wheel and run the test suite.
environment:
matrix:
# One CI job per entry; PYTHON is expanded as %PYTHON% in the commands below.
# NOTE(review): the paths without the "-x64" suffix are presumably the 32-bit
# interpreters - confirm against the AppVeyor build image.
- PYTHON: "C:\\Python35"
- PYTHON: "C:\\Python35-x64"
- PYTHON: "C:\\Python36"
- PYTHON: "C:\\Python36-x64"
- PYTHON: "C:\\Python37"
- PYTHON: "C:\\Python37-x64"
- PYTHON: "C:\\Python38"
- PYTHON: "C:\\Python38-x64"

# Currently disabled. NOTE(review): presumably meant to skip CI for commits
# that only touch the listed files - confirm before re-enabling.
# skip_commits:
# files:
# - "*.yml"
# - "*.rst"
# - "LICENSE"

# Upgrade pip, then install pytest, the project requirements and wheel using
# the interpreter selected by the matrix.
install:
- "%PYTHON%\\python.exe -m pip install -U pip"
- "%PYTHON%\\python.exe -m pip install -U pytest"
- "%PYTHON%\\python.exe -m pip install -r requirements.txt"
- "%PYTHON%\\python.exe -m pip install wheel"

# NOTE(review): per the AppVeyor documentation this switches off the default
# build phase; the wheel is built in test_script instead.
build: off

# Build the wheel first, then run the tests under gklearn/tests/ verbosely.
test_script:
- "%PYTHON%\\python.exe setup.py bdist_wheel"
- "%PYTHON%\\python.exe -m pytest -v gklearn/tests/"

+ 1
- 0
README.md View File

@@ -1,5 +1,6 @@
# graphkit-learn
[![Build Status](https://travis-ci.org/jajupmochi/graphkit-learn.svg?branch=master)](https://travis-ci.org/jajupmochi/graphkit-learn)
[![Build status](https://ci.appveyor.com/api/projects/status/bdxsolk0t1uji9rd?svg=true)](https://ci.appveyor.com/project/jajupmochi/graphkit-learn)
[![codecov](https://codecov.io/gh/jajupmochi/graphkit-learn/branch/master/graph/badge.svg)](https://codecov.io/gh/jajupmochi/graphkit-learn)
[![Documentation Status](https://readthedocs.org/projects/graphkit-learn/badge/?version=master)](https://graphkit-learn.readthedocs.io/en/master/?badge=master)
[![PyPI version](https://badge.fury.io/py/graphkit-learn.svg)](https://badge.fury.io/py/graphkit-learn)


+ 2
- 1
gklearn/ged/env/__init__.py View File

@@ -1 +1,2 @@
from gklearn.ged.env.common_types import AlgorithmState
from gklearn.ged.env.common_types import AlgorithmState
from gklearn.ged.env.node_map import NodeMap

+ 80
- 0
gklearn/ged/env/node_map.py View File

@@ -0,0 +1,80 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 22 11:31:26 2020

@author: ljia
"""
import numpy as np

class NodeMap(object):
    """Node assignment between a source graph ``g`` and a target graph ``h``.

    Two lists are kept in sync by :meth:`add_assignment`:

    * the forward map, indexed by source node ID, holding the target node
      each source node is mapped to;
    * the backward map, indexed by target node ID, holding the source node
      each target node is mapped from.

    ``np.inf`` is used as the single marker for both "not assigned yet" and
    "assigned to the dummy node" (i.e. node deletion / insertion); the two
    cases are not distinguished by this class.
    """

    def __init__(self, num_nodes_g, num_nodes_h):
        """Create an empty map.

        Parameters
        ----------
        num_nodes_g : int
            Number of nodes of the source graph.
        num_nodes_h : int
            Number of nodes of the target graph.
        """
        self.__forward_map = [np.inf] * num_nodes_g
        self.__backward_map = [np.inf] * num_nodes_h
        self.__induced_cost = np.inf

    def num_source_nodes(self):
        """Return the number of source nodes (length of the forward map)."""
        return len(self.__forward_map)

    def num_target_nodes(self):
        """Return the number of target nodes (length of the backward map)."""
        return len(self.__backward_map)

    def image(self, node):
        """Return the target node assigned to source node ``node``.

        Returns ``np.inf`` when the node has no (non-dummy) image.

        Raises
        ------
        IndexError
            If ``node`` is not smaller than the number of source nodes.
        """
        if node < len(self.__forward_map):
            return self.__forward_map[node]
        # Build one readable message instead of passing several arguments,
        # which would make the exception render as a tuple.
        raise IndexError('The node with ID ' + str(node) + ' is not contained in the source nodes of the node map.')

    def pre_image(self, node):
        """Return the source node assigned to target node ``node``.

        Returns ``np.inf`` when the node has no (non-dummy) pre-image.

        Raises
        ------
        IndexError
            If ``node`` is not smaller than the number of target nodes.
        """
        if node < len(self.__backward_map):
            return self.__backward_map[node]
        raise IndexError('The node with ID ' + str(node) + ' is not contained in the target nodes of the node map.')

    def get_forward_map(self):
        """Return the internal forward map list (not a copy)."""
        return self.__forward_map

    def get_backward_map(self):
        """Return the internal backward map list (not a copy)."""
        return self.__backward_map

    def as_relation(self, relation):
        """Rewrite ``relation`` in place as a list of ``(source, target)`` pairs.

        ``relation`` is cleared first. Then one pair ``(i, k)`` is appended
        for every source node ``i`` whose image ``k`` is not ``np.inf``,
        followed by one pair ``(np.inf, k)`` for every target node ``k``
        whose pre-image is ``np.inf``. Source nodes whose image is
        ``np.inf`` are not emitted.

        Parameters
        ----------
        relation : list
            Output list; its previous content is discarded.
        """
        relation.clear()
        for i, k in enumerate(self.__forward_map):
            if k != np.inf:
                relation.append((i, k))
        for k, i in enumerate(self.__backward_map):
            # Only target nodes without a source counterpart are added here;
            # the others were already covered by the forward pass.
            if i == np.inf:
                relation.append((i, k))

    def add_assignment(self, i, k):
        """Record that source node ``i`` is assigned to target node ``k``.

        Pass ``np.inf`` as ``i`` or ``k`` to denote the dummy node; the
        corresponding side of the map is then left untouched.

        Raises
        ------
        IndexError
            If ``i`` (resp. ``k``) is not ``np.inf`` and is not smaller than
            the number of source (resp. target) nodes. The forward map is
            updated before ``k`` is checked.
        """
        if i != np.inf:
            if i < len(self.__forward_map):
                self.__forward_map[i] = k
            else:
                raise IndexError('The node with ID ' + str(i) + ' is not contained in the source nodes of the node map.')
        if k != np.inf:
            if k < len(self.__backward_map):
                self.__backward_map[k] = i
            else:
                raise IndexError('The node with ID ' + str(k) + ' is not contained in the target nodes of the node map.')

    def set_induced_cost(self, induced_cost):
        """Store the edit cost induced by this node map."""
        self.__induced_cost = induced_cost

    def induced_cost(self):
        """Return the stored induced cost (``np.inf`` until it is set)."""
        return self.__induced_cost

+ 642
- 143
gklearn/ged/median/median_graph_estimator.py
File diff suppressed because it is too large
View File


+ 87
- 9
gklearn/ged/median/test_median_graph_estimator.py View File

@@ -7,11 +7,10 @@ Created on Mon Mar 16 17:26:40 2020
"""
def test_median_graph_estimator():
from gklearn.utils.graphfiles import loadDataset
from gklearn.utils import load_dataset
from gklearn.ged.median import MedianGraphEstimator, constant_node_costs
from gklearn.gedlib import librariesImport, gedlibpy
from gklearn.preimage.utils import get_same_item_indices
from gklearn.preimage.ged import convertGraph
import multiprocessing

# estimator parameters.
@@ -22,17 +21,17 @@ def test_median_graph_estimator():
# algorithm parameters.
algo = 'IPFP'
initial_solutions = 40
algo_options_suffix = ' --initial-solutions ' + str(initial_solutions) + ' --ratio-runs-from-initial-solutions 1'
initial_solutions = 1
algo_options_suffix = ' --initial-solutions ' + str(initial_solutions) + ' --ratio-runs-from-initial-solutions 1 --initialization-method NODE '

edit_cost_name = 'LETTER2'
edit_cost_constants = [0.02987291, 0.0178211, 0.01431966, 0.001, 0.001]
ds_name = 'COIL-DEL'
ds_name = 'Letter_high'
# Load dataset.
# dataset = '../../datasets/COIL-DEL/COIL-DEL_A.txt'
dataset = '../../../datasets/Letter-high/Letter-high_A.txt'
Gn, y_all = loadDataset(dataset)
Gn, y_all, label_names = load_dataset(dataset)
y_idx = get_same_item_indices(y_all)
for i, (y, values) in enumerate(y_idx.items()):
Gn_i = [Gn[val] for val in values]
@@ -43,7 +42,7 @@ def test_median_graph_estimator():
# gedlibpy.restart_env()
ged_env.set_edit_cost(edit_cost_name, edit_cost_constant=edit_cost_constants)
for G in Gn_i:
ged_env.add_nx_graph(convertGraph(G, edit_cost_name), '')
ged_env.add_nx_graph(G, '')
graph_ids = ged_env.get_all_graph_ids()
set_median_id = ged_env.add_graph('set_median')
gen_median_id = ged_env.add_graph('gen_median')
@@ -54,11 +53,89 @@ def test_median_graph_estimator():
mge.set_refine_method(algo, '--threads ' + str(threads) + ' --initial-solutions ' + str(initial_solutions) + ' --ratio-runs-from-initial-solutions 1')
mge_options = '--time-limit ' + str(time_limit) + ' --stdout 2 --init-type ' + init_type
mge_options += ' --random-inits ' + str(num_inits) + ' --seed ' + '1' + ' --refine FALSE'# @todo: std::to_string(rng())
mge_options += ' --random-inits ' + str(num_inits) + ' --seed ' + '1' + ' --update-order TRUE --refine FALSE --randomness PSEUDO '# @todo: std::to_string(rng())
# Select the GED algorithm.
algo_options = '--threads ' + str(threads) + algo_options_suffix
mge.set_options(mge_options)
mge.set_label_names(node_labels=label_names['node_labels'],
edge_labels=label_names['edge_labels'],
node_attrs=label_names['node_attrs'],
edge_attrs=label_names['edge_attrs'])
mge.set_init_method(algo, algo_options)
mge.set_descent_method(algo, algo_options)
# Run the estimator.
mge.run(graph_ids, set_median_id, gen_median_id)
# Get SODs.
sod_sm = mge.get_sum_of_distances('initialized')
sod_gm = mge.get_sum_of_distances('converged')
print('sod_sm, sod_gm: ', sod_sm, sod_gm)
# Get median graphs.
set_median = ged_env.get_nx_graph(set_median_id)
gen_median = ged_env.get_nx_graph(gen_median_id)
return set_median, gen_median


def test_median_graph_estimator_symb():
from gklearn.utils import load_dataset
from gklearn.ged.median import MedianGraphEstimator, constant_node_costs
from gklearn.gedlib import librariesImport, gedlibpy
from gklearn.preimage.utils import get_same_item_indices
import multiprocessing

# estimator parameters.
init_type = 'MEDOID'
num_inits = 1
threads = multiprocessing.cpu_count()
time_limit = 60000
# algorithm parameters.
algo = 'IPFP'
initial_solutions = 1
algo_options_suffix = ' --initial-solutions ' + str(initial_solutions) + ' --ratio-runs-from-initial-solutions 1 --initialization-method NODE '

edit_cost_name = 'CONSTANT'
edit_cost_constants = [4, 4, 2, 1, 1, 1]
ds_name = 'MUTAG'
# Load dataset.
dataset = '../../../datasets/MUTAG/MUTAG_A.txt'
Gn, y_all, label_names = load_dataset(dataset)
y_idx = get_same_item_indices(y_all)
for i, (y, values) in enumerate(y_idx.items()):
Gn_i = [Gn[val] for val in values]
break
Gn_i = Gn_i[0:10]
# Set up the environment.
ged_env = gedlibpy.GEDEnv()
# gedlibpy.restart_env()
ged_env.set_edit_cost(edit_cost_name, edit_cost_constant=edit_cost_constants)
for G in Gn_i:
ged_env.add_nx_graph(G, '')
graph_ids = ged_env.get_all_graph_ids()
set_median_id = ged_env.add_graph('set_median')
gen_median_id = ged_env.add_graph('gen_median')
ged_env.init(init_option='EAGER_WITHOUT_SHUFFLED_COPIES')
# Set up the estimator.
mge = MedianGraphEstimator(ged_env, constant_node_costs(edit_cost_name))
mge.set_refine_method(algo, '--threads ' + str(threads) + ' --initial-solutions ' + str(initial_solutions) + ' --ratio-runs-from-initial-solutions 1')
mge_options = '--time-limit ' + str(time_limit) + ' --stdout 2 --init-type ' + init_type
mge_options += ' --random-inits ' + str(num_inits) + ' --seed ' + '1' + ' --update-order TRUE --refine FALSE'# @todo: std::to_string(rng())
# Select the GED algorithm.
algo_options = '--threads ' + str(threads) + algo_options_suffix
mge.set_options(mge_options)
mge.set_label_names(node_labels=label_names['node_labels'],
edge_labels=label_names['edge_labels'],
node_attrs=label_names['node_attrs'],
edge_attrs=label_names['edge_attrs'])
mge.set_init_method(algo, algo_options)
mge.set_descent_method(algo, algo_options)
@@ -78,4 +155,5 @@ def test_median_graph_estimator():


if __name__ == '__main__':
set_median, gen_median = test_median_graph_estimator()
set_median, gen_median = test_median_graph_estimator()
# set_median, gen_median = test_median_graph_estimator_symb()

+ 2
- 0
gklearn/ged/median/utils.py View File

@@ -30,6 +30,8 @@ def mge_options_to_string(options):
opt_str += '--randomness ' + str(val) + ' '
elif key == 'verbose':
opt_str += '--stdout ' + str(val) + ' '
elif key == 'update_order':
opt_str += '--update-order ' + ('TRUE' if val else 'FALSE') + ' '
elif key == 'refine':
opt_str += '--refine ' + ('TRUE' if val else 'FALSE') + ' '
elif key == 'time_limit':


+ 2855
- 1747
gklearn/gedlib/gedlibpy.cpp
File diff suppressed because it is too large
View File


BIN
gklearn/gedlib/gedlibpy.cpython-36m-x86_64-linux-gnu.so View File


+ 44
- 19
gklearn/gedlib/gedlibpy.pyx View File

@@ -35,8 +35,8 @@ from libcpp.pair cimport pair
from libcpp.list cimport list

#Long unsigned int equivalent
cimport numpy as np
ctypedef np.npy_uint32 UINT32_t
cimport numpy as cnp
ctypedef cnp.npy_uint32 UINT32_t
from cpython cimport array

@@ -76,14 +76,14 @@ cdef extern from "src/GedLibBind.hpp" namespace "pyged":
void runMethod(size_t g, size_t h) except +
double getUpperBound(size_t g, size_t h) except +
double getLowerBound(size_t g, size_t h) except +
vector[np.npy_uint64] getForwardMap(size_t g, size_t h) except +
vector[np.npy_uint64] getBackwardMap(size_t g, size_t h) except +
vector[cnp.npy_uint64] getForwardMap(size_t g, size_t h) except +
vector[cnp.npy_uint64] getBackwardMap(size_t g, size_t h) except +
size_t getNodeImage(size_t g, size_t h, size_t nodeId) except +
size_t getNodePreImage(size_t g, size_t h, size_t nodeId) except +
double getInducedCost(size_t g, size_t h) except +
vector[pair[size_t,size_t]] getNodeMap(size_t g, size_t h) except +
vector[vector[int]] getAssignmentMatrix(size_t g, size_t h) except +
vector[vector[np.npy_uint64]] getAllMap(size_t g, size_t h) except +
vector[vector[cnp.npy_uint64]] getAllMap(size_t g, size_t h) except +
double getRuntime(size_t g, size_t h) except +
bool quasimetricCosts() except +
vector[vector[size_t]] hungarianLSAP(vector[vector[size_t]] matrixCost) except +
@@ -105,14 +105,16 @@ cdef extern from "src/GedLibBind.hpp" namespace "pyged":
map[string, string] getMedianEdgeLabel(vector[map[string, string]] & edge_labels) except +
string getInitType() except +
# double getNodeCost(size_t label1, size_t label2) except +
void computeInducedCost(size_t g_id, size_t h_id) except +
double computeInducedCost(size_t g_id, size_t h_id, vector[pair[size_t,size_t]]) except +
#############################
##CYTHON WRAPPER INTERFACES##
#############################

import numpy as np
import networkx as nx
from gklearn.ged.env import NodeMap

# import librariesImport
from ctypes import *
@@ -726,13 +728,30 @@ cdef class GEDEnv:
:type g: size_t
:type h: size_t
:return: The Node Map between the two selected graph.
:rtype: list[tuple(size_t, size_t)]
:rtype: gklearn.ged.env.NodeMap.
.. seealso:: run_method(), get_forward_map(), get_backward_map(), get_node_image(), get_node_pre_image(), get_assignment_matrix()
.. warning:: run_method() between the same two graph must be called before this function.
.. note:: This function creates data, so use it only if necessary; however, you can understand how the assignment works with this example.
"""
return self.c_env.getNodeMap(g, h)
map_as_relation = self.c_env.getNodeMap(g, h)
induced_cost = self.c_env.getInducedCost(g, h) # @todo: the C++ implementation for this function in GedLibBind.ipp re-call get_node_map() once more, this is not neccessary.
source_map = [item.first if item.first < len(map_as_relation) else np.inf for item in map_as_relation] # item.first < len(map_as_relation) is not exactly correct.
# print(source_map)
target_map = [item.second if item.second < len(map_as_relation) else np.inf for item in map_as_relation]
# print(target_map)
num_node_source = len([item for item in source_map if item != np.inf])
# print(num_node_source)
num_node_target = len([item for item in target_map if item != np.inf])
# print(num_node_target)
node_map = NodeMap(num_node_source, num_node_target)
# print(node_map.get_forward_map(), node_map.get_backward_map())
for i in range(len(source_map)):
node_map.add_assignment(source_map[i], target_map[i])
node_map.set_induced_cost(induced_cost)
return node_map
def get_assignment_matrix(self, g, h) :
@@ -1320,7 +1339,7 @@ cdef class GEDEnv:
return graph_id
def compute_induced_cost(self, g_id, h_id):
def compute_induced_cost(self, g_id, h_id, node_map):
"""
Computes the edit cost between two graphs induced by a node map.

@@ -1330,19 +1349,25 @@ cdef class GEDEnv:
ID of input graph.
h_id : int
ID of input graph.
node_map: gklearn.ged.env.NodeMap.
The NodeMap instance whose induced cost will be computed and re-assigned.

Returns
-------
None.
Notes
-----
The induced edit cost of the node map between `g_id` and `h_id` is implictly computed and stored in `GEDEnv::node_maps_`.

"""
cost = 0.0
self.c_env.computeInducedCost(g_id, h_id)
None.
"""
relation = []
node_map.as_relation(relation)
# print(relation)
dummy_node = get_dummy_node()
# print(dummy_node)
for i, val in enumerate(relation):
val1 = dummy_node if val[0] == np.inf else val[0]
val2 = dummy_node if val[1] == np.inf else val[1]
relation[i] = tuple((val1, val2))
# print(relation)
induced_cost = self.c_env.computeInducedCost(g_id, h_id, relation)
node_map.set_induced_cost(induced_cost)

#####################################################################


+ 3
- 2
gklearn/gedlib/src/GedLibBind.hpp View File

@@ -475,8 +475,9 @@ public:
* @brief Computes the edit cost between two graphs induced by a node map.
* @param[in] g_id ID of input graph.
* @param[in] h_id ID of input graph.
* @return Computed induced cost.
*/
void computeInducedCost(std::size_t g_id, std::size_t h_id) const;
double computeInducedCost(std::size_t g_id, std::size_t h_id, std::vector<pair<std::size_t, std::size_t>> relation) const;

// /*!
// * @brief Returns node relabeling, insertion, or deletion cost.
@@ -492,7 +493,7 @@ public:

private:

ged::GEDEnv<ged::GXLNodeID, ged::GXLLabel, ged::GXLLabel> env; // environment variable
ged::GEDEnv<ged::GXLNodeID, ged::GXLLabel, ged::GXLLabel> * env_; // environment variable

bool initialized; // initialization boolean (because env has one but not accessible)



+ 89
- 54
gklearn/gedlib/src/GedLibBind.ipp View File

@@ -277,11 +277,16 @@ std::string toStringVectorInt(std::vector<unsigned long int> vector) {


PyGEDEnv::PyGEDEnv () {
this->env = ged::GEDEnv<ged::GXLNodeID, ged::GXLLabel, ged::GXLLabel>();
env_ = new ged::GEDEnv<ged::GXLNodeID, ged::GXLLabel, ged::GXLLabel>();
this->initialized = false;
}

PyGEDEnv::~PyGEDEnv () {}
PyGEDEnv::~PyGEDEnv () {
if (env_ != NULL) {
delete env_;
env_ = NULL;
}
}

// bool initialized = false; //Initialization boolean (because Env has one but not accessible).

@@ -290,64 +295,68 @@ bool PyGEDEnv::isInitialized() {
}

void PyGEDEnv::restartEnv() {
this->env = ged::GEDEnv<ged::GXLNodeID, ged::GXLLabel, ged::GXLLabel>();
if (env_ != NULL) {
delete env_;
env_ = NULL;
}
env_ = new ged::GEDEnv<ged::GXLNodeID, ged::GXLLabel, ged::GXLLabel>();
initialized = false;
}

void PyGEDEnv::loadGXLGraph(const std::string & pathFolder, const std::string & pathXML, bool node_type, bool edge_type) {
std::vector<ged::GEDGraph::GraphID> tmp_graph_ids(this->env.load_gxl_graph(pathFolder, pathXML,
std::vector<ged::GEDGraph::GraphID> tmp_graph_ids(env_->load_gxl_graph(pathFolder, pathXML,
(node_type ? ged::Options::GXLNodeEdgeType::LABELED : ged::Options::GXLNodeEdgeType::UNLABELED),
(edge_type ? ged::Options::GXLNodeEdgeType::LABELED : ged::Options::GXLNodeEdgeType::UNLABELED),
std::unordered_set<std::string>(), std::unordered_set<std::string>()));
}

std::pair<std::size_t,std::size_t> PyGEDEnv::getGraphIds() const {
return this->env.graph_ids();
return env_->graph_ids();
}

std::vector<std::size_t> PyGEDEnv::getAllGraphIds() {
std::vector<std::size_t> listID;
for (std::size_t i = this->env.graph_ids().first; i != this->env.graph_ids().second; i++) {
for (std::size_t i = env_->graph_ids().first; i != env_->graph_ids().second; i++) {
listID.push_back(i);
}
return listID;
}

const std::string PyGEDEnv::getGraphClass(std::size_t id) const {
return this->env.get_graph_class(id);
return env_->get_graph_class(id);
}

const std::string PyGEDEnv::getGraphName(std::size_t id) const {
return this->env.get_graph_name(id);
return env_->get_graph_name(id);
}

std::size_t PyGEDEnv::addGraph(const std::string & graph_name, const std::string & graph_class) {
ged::GEDGraph::GraphID newId = this->env.add_graph(graph_name, graph_class);
ged::GEDGraph::GraphID newId = env_->add_graph(graph_name, graph_class);
initialized = false;
return std::stoi(std::to_string(newId));
}

void PyGEDEnv::addNode(std::size_t graphId, const std::string & nodeId, const std::map<std::string, std::string> & nodeLabel) {
this->env.add_node(graphId, nodeId, nodeLabel);
env_->add_node(graphId, nodeId, nodeLabel);
initialized = false;
}

/*void addEdge(std::size_t graphId, ged::GXLNodeID tail, ged::GXLNodeID head, ged::GXLLabel edgeLabel) {
this->env.add_edge(graphId, tail, head, edgeLabel);
env_->add_edge(graphId, tail, head, edgeLabel);
}*/

void PyGEDEnv::addEdge(std::size_t graphId, const std::string & tail, const std::string & head, const std::map<std::string, std::string> & edgeLabel, bool ignoreDuplicates) {
this->env.add_edge(graphId, tail, head, edgeLabel, ignoreDuplicates);
env_->add_edge(graphId, tail, head, edgeLabel, ignoreDuplicates);
initialized = false;
}

void PyGEDEnv::clearGraph(std::size_t graphId) {
this->env.clear_graph(graphId);
env_->clear_graph(graphId);
initialized = false;
}

ged::ExchangeGraph<ged::GXLNodeID, ged::GXLLabel, ged::GXLLabel> PyGEDEnv::getGraph(std::size_t graphId) const {
return this->env.get_graph(graphId);
return env_->get_graph(graphId);
}

std::size_t PyGEDEnv::getGraphInternalId(std::size_t graphId) {
@@ -379,71 +388,71 @@ std::vector<std::vector<std::size_t>> PyGEDEnv::getGraphAdjacenceMatrix(std::siz
}

void PyGEDEnv::setEditCost(std::string editCost, std::vector<double> editCostConstants) {
this->env.set_edit_costs(translateEditCost(editCost), editCostConstants);
env_->set_edit_costs(translateEditCost(editCost), editCostConstants);
}

void PyGEDEnv::setPersonalEditCost(std::vector<double> editCostConstants) {
//this->env.set_edit_costs(Your EditCost Class(editCostConstants));
//env_->set_edit_costs(Your EditCost Class(editCostConstants));
}

// void PyGEDEnv::initEnv() {
// this->env.init();
// env_->init();
// initialized = true;
// }

void PyGEDEnv::initEnv(std::string initOption, bool print_to_stdout) {
this->env.init(translateInitOptions(initOption), print_to_stdout);
env_->init(translateInitOptions(initOption), print_to_stdout);
initialized = true;
}

void PyGEDEnv::setMethod(std::string method, const std::string & options) {
this->env.set_method(translateMethod(method), options);
env_->set_method(translateMethod(method), options);
}

void PyGEDEnv::initMethod() {
this->env.init_method();
env_->init_method();
}

double PyGEDEnv::getInitime() const {
return this->env.get_init_time();
return env_->get_init_time();
}

void PyGEDEnv::runMethod(std::size_t g, std::size_t h) {
this->env.run_method(g, h);
env_->run_method(g, h);
}

double PyGEDEnv::getUpperBound(std::size_t g, std::size_t h) const {
return this->env.get_upper_bound(g, h);
return env_->get_upper_bound(g, h);
}

double PyGEDEnv::getLowerBound(std::size_t g, std::size_t h) const {
return this->env.get_lower_bound(g, h);
return env_->get_lower_bound(g, h);
}

std::vector<long unsigned int> PyGEDEnv::getForwardMap(std::size_t g, std::size_t h) const {
return this->env.get_node_map(g, h).get_forward_map();
return env_->get_node_map(g, h).get_forward_map();
}

std::vector<long unsigned int> PyGEDEnv::getBackwardMap(std::size_t g, std::size_t h) const {
return this->env.get_node_map(g, h).get_backward_map();
return env_->get_node_map(g, h).get_backward_map();
}

std::size_t PyGEDEnv::getNodeImage(std::size_t g, std::size_t h, std::size_t nodeId) const {
return this->env.get_node_map(g, h).image(nodeId);
return env_->get_node_map(g, h).image(nodeId);
}

std::size_t PyGEDEnv::getNodePreImage(std::size_t g, std::size_t h, std::size_t nodeId) const {
return this->env.get_node_map(g, h).pre_image(nodeId);
return env_->get_node_map(g, h).pre_image(nodeId);
}

double PyGEDEnv::getInducedCost(std::size_t g, std::size_t h) const {
return this->env.get_node_map(g, h).induced_cost();
return env_->get_node_map(g, h).induced_cost();
}

std::vector<pair<std::size_t, std::size_t>> PyGEDEnv::getNodeMap(std::size_t g, std::size_t h) {
std::vector<pair<std::size_t, std::size_t>> res;
std::vector<ged::NodeMap::Assignment> relation;
this->env.get_node_map(g, h).as_relation(relation);
env_->get_node_map(g, h).as_relation(relation);
for (const auto & assignment : relation) {
res.push_back(std::make_pair(assignment.first, assignment.second));
}
@@ -493,11 +502,11 @@ std::vector<std::vector<unsigned long int>> PyGEDEnv::getAllMap(std::size_t g, s
}

double PyGEDEnv::getRuntime(std::size_t g, std::size_t h) const {
return this->env.get_runtime(g, h);
return env_->get_runtime(g, h);
}

bool PyGEDEnv::quasimetricCosts() const {
return this->env.quasimetric_costs();
return env_->quasimetric_costs();
}

std::vector<std::vector<size_t>> PyGEDEnv::hungarianLSAP(std::vector<std::vector<std::size_t>> matrixCost) {
@@ -542,73 +551,99 @@ std::vector<std::vector<double>> PyGEDEnv::hungarianLSAPE(std::vector<std::vecto
}

std::size_t PyGEDEnv::getNumNodeLabels() const {
return this->env.num_node_labels();
return env_->num_node_labels();
}

std::map<std::string, std::string> PyGEDEnv::getNodeLabel(std::size_t label_id) const {
return this->env.get_node_label(label_id);
return env_->get_node_label(label_id);
}

std::size_t PyGEDEnv::getNumEdgeLabels() const {
return this->env.num_edge_labels();
return env_->num_edge_labels();
}

std::map<std::string, std::string> PyGEDEnv::getEdgeLabel(std::size_t label_id) const {
return this->env.get_edge_label(label_id);
return env_->get_edge_label(label_id);
}

// std::size_t PyGEDEnv::getNumNodes(std::size_t graph_id) const {
// return this->env.get_num_nodes(graph_id);
// return env_->get_num_nodes(graph_id);
// }

double PyGEDEnv::getAvgNumNodes() const {
return this->env.get_avg_num_nodes();
return env_->get_avg_num_nodes();
}

double PyGEDEnv::getNodeRelCost(const std::map<std::string, std::string> & node_label_1, const std::map<std::string, std::string> & node_label_2) const {
return this->env.node_rel_cost(node_label_1, node_label_2);
return env_->node_rel_cost(node_label_1, node_label_2);
}

double PyGEDEnv::getNodeDelCost(const std::map<std::string, std::string> & node_label) const {
return this->env.node_del_cost(node_label);
return env_->node_del_cost(node_label);
}

double PyGEDEnv::getNodeInsCost(const std::map<std::string, std::string> & node_label) const {
return this->env.node_ins_cost(node_label);
return env_->node_ins_cost(node_label);
}

std::map<std::string, std::string> PyGEDEnv::getMedianNodeLabel(const std::vector<std::map<std::string, std::string>> & node_labels) const {
return this->env.median_node_label(node_labels);
return env_->median_node_label(node_labels);
}

double PyGEDEnv::getEdgeRelCost(const std::map<std::string, std::string> & edge_label_1, const std::map<std::string, std::string> & edge_label_2) const {
return this->env.edge_rel_cost(edge_label_1, edge_label_2);
return env_->edge_rel_cost(edge_label_1, edge_label_2);
}

double PyGEDEnv::getEdgeDelCost(const std::map<std::string, std::string> & edge_label) const {
return this->env.edge_del_cost(edge_label);
return env_->edge_del_cost(edge_label);
}

double PyGEDEnv::getEdgeInsCost(const std::map<std::string, std::string> & edge_label) const {
return this->env.edge_ins_cost(edge_label);
return env_->edge_ins_cost(edge_label);
}

std::map<std::string, std::string> PyGEDEnv::getMedianEdgeLabel(const std::vector<std::map<std::string, std::string>> & edge_labels) const {
return this->env.median_edge_label(edge_labels);
return env_->median_edge_label(edge_labels);
}

std::string PyGEDEnv::getInitType() const {
return initOptionsToString(this->env.get_init_type());
return initOptionsToString(env_->get_init_type());
}

void PyGEDEnv::computeInducedCost(std::size_t g_id, std::size_t h_id) const {
ged::NodeMap node_map = this->env.get_node_map(g_id, h_id);
this->env.compute_induced_cost(g_id, h_id, node_map);
double PyGEDEnv::computeInducedCost(std::size_t g_id, std::size_t h_id, std::vector<pair<std::size_t, std::size_t>> relation) const {
ged::NodeMap node_map = ged::NodeMap(env_->get_num_nodes(g_id), env_->get_num_nodes(h_id));
for (const auto & assignment : relation) {
node_map.add_assignment(assignment.first, assignment.second);
// std::cout << assignment.first << assignment.second << endl;
}
const std::vector<ged::GEDGraph::NodeID> forward_map = node_map.get_forward_map();
for (std::size_t i{0}; i < node_map.num_source_nodes(); i++) {
if (forward_map.at(i) == ged::GEDGraph::undefined_node()) {
node_map.add_assignment(i, ged::GEDGraph::dummy_node());
}
}
const std::vector<ged::GEDGraph::NodeID> backward_map = node_map.get_backward_map();
for (std::size_t i{0}; i < node_map.num_target_nodes(); i++) {
if (backward_map.at(i) == ged::GEDGraph::undefined_node()) {
node_map.add_assignment(ged::GEDGraph::dummy_node(), i);
}
}
// for (auto & map : node_map.get_forward_map()) {
// std::cout << map << ", ";
// }
// std::cout << endl;
// for (auto & map : node_map.get_backward_map()) {
// std::cout << map << ", ";
// }
env_->compute_induced_cost(g_id, h_id, node_map);
return node_map.induced_cost();
}




// double PyGEDEnv::getNodeCost(std::size_t label1, std::size_t label2) const {
// return this->env.ged_data_node_cost(label1, label2);
// return env_->ged_data_node_cost(label1, label2);
// }


@@ -630,7 +665,7 @@ void PyGEDEnv::computeInducedCost(std::size_t g_id, std::size_t h_id) const {

/*loadGXLGraph(pathFolder, pathXML);
std::vector<std::size_t> graph_ids = getAllGraphIds();
std::size_t median_id = this->env.add_graph("median", "");
std::size_t median_id = env_->add_graph("median", "");

initEnv(initOption);

@@ -640,10 +675,10 @@ void PyGEDEnv::computeInducedCost(std::size_t g_id, std::size_t h_id) const {
median_estimator.set_options("--init-type RANDOM --randomness PSEUDO --seed " + seed);
median_estimator.run(graph_ids, median_id);
std::string gxl_file_name("../output/gen_median_Letter_HIGH_" + letter_class + ".gxl");
this->env.save_as_gxl_graph(median_id, gxl_file_name);*/
env_->save_as_gxl_graph(median_id, gxl_file_name);*/

/*std::string tikz_file_name("../output/gen_median_Letter_HIGH_" + letter_class + ".tex");
save_letter_graph_as_tikz_file(this->env.get_graph(median_id), tikz_file_name);*/
save_letter_graph_as_tikz_file(env_->get_graph(median_id), tikz_file_name);*/
//}

}


+ 1
- 1
gklearn/kernels/__init__.py View File

@@ -12,4 +12,4 @@ from gklearn.kernels.structural_sp import StructuralSP
from gklearn.kernels.shortest_path import ShortestPath
from gklearn.kernels.path_up_to_h import PathUpToH
from gklearn.kernels.treelet import Treelet
from gklearn.kernels.weisfeiler_lehman import WeisfeilerLehman
from gklearn.kernels.weisfeiler_lehman import WeisfeilerLehman, WLSubtree

+ 9
- 8
gklearn/kernels/path_up_to_h.py View File

@@ -18,6 +18,7 @@ import numpy as np
import networkx as nx
from collections import Counter
from functools import partial
from gklearn.utils import SpecialLabel
from gklearn.utils.parallel import parallel_gm, parallel_me
from gklearn.kernels import GraphKernel
from gklearn.utils import Trie
@@ -582,11 +583,11 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func == None
def __add_dummy_labels(self, Gn):
if self.__k_func is not None:
if len(self.__node_labels) == 0:
for G in Gn:
nx.set_node_attributes(G, '0', 'dummy')
self.__node_labels.append('dummy')
if len(self.__edge_labels) == 0:
for G in Gn:
nx.set_edge_attributes(G, '0', 'dummy')
self.__edge_labels.append('dummy')
if len(self.__node_labels) == 0 or (len(self.__node_labels) == 1 and self.__node_labels[0] == SpecialLabel.DUMMY):
for i in range(len(Gn)):
nx.set_node_attributes(Gn[i], '0', SpecialLabel.DUMMY)
self.__node_labels = [SpecialLabel.DUMMY]
if len(self.__edge_labels) == 0 or (len(self.__edge_labels) == 1 and self.__edge_labels[0] == SpecialLabel.DUMMY):
for i in range(len(Gn)):
nx.set_edge_attributes(Gn[i], '0', SpecialLabel.DUMMY)
self.__edge_labels = [SpecialLabel.DUMMY]

+ 9
- 8
gklearn/kernels/treelet.py View File

@@ -18,6 +18,7 @@ import numpy as np
import networkx as nx
from collections import Counter
from itertools import chain
from gklearn.utils import SpecialLabel
from gklearn.utils.parallel import parallel_gm, parallel_me
from gklearn.utils.utils import find_all_paths, get_mlti_dim_node_attrs
from gklearn.kernels import GraphKernel
@@ -495,11 +496,11 @@ class Treelet(GraphKernel):
def __add_dummy_labels(self, Gn):
if len(self.__node_labels) == 0:
for G in Gn:
nx.set_node_attributes(G, '0', 'dummy')
self.__node_labels.append('dummy')
if len(self.__edge_labels) == 0:
for G in Gn:
nx.set_edge_attributes(G, '0', 'dummy')
self.__edge_labels.append('dummy')
if len(self.__node_labels) == 0 or (len(self.__node_labels) == 1 and self.__node_labels[0] == SpecialLabel.DUMMY):
for i in range(len(Gn)):
nx.set_node_attributes(Gn[i], '0', SpecialLabel.DUMMY)
self.__node_labels = [SpecialLabel.DUMMY]
if len(self.__edge_labels) == 0 or (len(self.__edge_labels) == 1 and self.__edge_labels[0] == SpecialLabel.DUMMY):
for i in range(len(Gn)):
nx.set_edge_attributes(Gn[i], '0', SpecialLabel.DUMMY)
self.__edge_labels = [SpecialLabel.DUMMY]

+ 25
- 7
gklearn/kernels/weisfeiler_lehman.py View File

@@ -16,6 +16,7 @@ import numpy as np
import networkx as nx
from collections import Counter
from functools import partial
from gklearn.utils import SpecialLabel
from gklearn.utils.parallel import parallel_gm
from gklearn.kernels import GraphKernel

@@ -32,6 +33,10 @@ class WeisfeilerLehman(GraphKernel): # @todo: total parallelization and sp, edge


def _compute_gm_series(self):
if self._verbose >= 2:
import warnings
warnings.warn('A part of the computation is parallelized.')
self.__add_dummy_node_labels(self._graphs)
# for WL subtree kernel
@@ -55,11 +60,16 @@ class WeisfeilerLehman(GraphKernel): # @todo: total parallelization and sp, edge
def _compute_gm_imap_unordered(self):
if self._verbose >= 2:
raise Warning('Only a part of the computation is parallelized due to the structure of this kernel.')
import warnings
warnings.warn('Only a part of the computation is parallelized due to the structure of this kernel.')
return self._compute_gm_series()
def _compute_kernel_list_series(self, g1, g_list): # @todo: this should be better.
if self._verbose >= 2:
import warnings
warnings.warn('A part of the computation is parallelized.')
self.__add_dummy_node_labels(g_list + [g1])
# for WL subtree kernel
@@ -83,8 +93,9 @@ class WeisfeilerLehman(GraphKernel): # @todo: total parallelization and sp, edge
def _compute_kernel_list_imap_unordered(self, g1, g_list):
if self._verbose >= 2:
raise Warning('Only a part of the computation is parallelized due to the structure of this kernel.')
return self._compute_gm_imap_unordered()
import warnings
warnings.warn('Only a part of the computation is parallelized due to the structure of this kernel.')
return self._compute_kernel_list_series(g1, g_list)
def _wrapper_kernel_list_do(self, itr):
@@ -459,7 +470,14 @@ class WeisfeilerLehman(GraphKernel): # @todo: total parallelization and sp, edge
def __add_dummy_node_labels(self, Gn):
if len(self.__node_labels) == 0:
for G in Gn:
nx.set_node_attributes(G, '0', 'dummy')
self.__node_labels.append('dummy')
if len(self.__node_labels) == 0 or (len(self.__node_labels) == 1 and self.__node_labels[0] == SpecialLabel.DUMMY):
for i in range(len(Gn)):
nx.set_node_attributes(Gn[i], '0', SpecialLabel.DUMMY)
self.__node_labels = [SpecialLabel.DUMMY]
class WLSubtree(WeisfeilerLehman):
    """Weisfeiler-Lehman kernel whose base kernel is fixed to ``'subtree'``."""

    def __init__(self, **kwargs):
        # Any 'base_kernel' supplied by the caller is overridden on purpose.
        options = dict(kwargs, base_kernel='subtree')
        super().__init__(**options)

+ 801
- 56
gklearn/preimage/experiments/xp_median_preimage.py
File diff suppressed because it is too large
View File


+ 21
- 47
gklearn/preimage/median_preimage_generator.py View File

@@ -18,6 +18,7 @@ from gklearn.ged.median import MedianGraphEstimator
from gklearn.ged.median import constant_node_costs,mge_options_to_string
from gklearn.gedlib import librariesImport, gedlibpy
from gklearn.utils import Timer
from gklearn.utils.utils import get_graph_kernel_by_name
# from gklearn.utils.dataset import Dataset

class MedianPreimageGenerator(PreimageGenerator):
@@ -81,7 +82,13 @@ class MedianPreimageGenerator(PreimageGenerator):
def run(self):
self.__set_graph_kernel_by_name()
self._graph_kernel = get_graph_kernel_by_name(self._kernel_options['name'],
node_labels=self._dataset.node_labels,
edge_labels=self._dataset.edge_labels,
node_attrs=self._dataset.node_attrs,
edge_attrs=self._dataset.edge_attrs,
ds_infos=self._dataset.get_dataset_infos(keys=['directed']),
kernel_options=self._kernel_options)
# record start time.
start = time.time()
@@ -180,6 +187,10 @@ class MedianPreimageGenerator(PreimageGenerator):
results['itrs'] = self.__itrs
results['converged'] = self.__converged
results['num_updates_ecc'] = self.__num_updates_ecc
results['mge'] = {}
results['mge']['num_decrease_order'] = self.__mge.get_num_times_order_decreased()
results['mge']['num_increase_order'] = self.__mge.get_num_times_order_increased()
results['mge']['num_converged_descents'] = self.__mge.get_num_converged_descents()
return results

@@ -653,27 +664,27 @@ class MedianPreimageGenerator(PreimageGenerator):
ged_env.init(init_option=self.__ged_options['init_option'])
# Set up the median graph estimator.
mge = MedianGraphEstimator(ged_env, constant_node_costs(self.__ged_options['edit_cost']))
mge.set_refine_method(self.__ged_options['method'], ged_options_to_string(self.__ged_options))
self.__mge = MedianGraphEstimator(ged_env, constant_node_costs(self.__ged_options['edit_cost']))
self.__mge.set_refine_method(self.__ged_options['method'], ged_options_to_string(self.__ged_options))
options = self.__mge_options.copy()
if not 'seed' in options:
options['seed'] = int(round(time.time() * 1000)) # @todo: may not work correctly for possible parallel usage.
# Select the GED algorithm.
mge.set_options(mge_options_to_string(options))
mge.set_label_names(node_labels=self._dataset.node_labels,
self.__mge.set_options(mge_options_to_string(options))
self.__mge.set_label_names(node_labels=self._dataset.node_labels,
edge_labels=self._dataset.edge_labels,
node_attrs=self._dataset.node_attrs,
edge_attrs=self._dataset.edge_attrs)
mge.set_init_method(self.__ged_options['method'], ged_options_to_string(self.__ged_options))
mge.set_descent_method(self.__ged_options['method'], ged_options_to_string(self.__ged_options))
self.__mge.set_init_method(self.__ged_options['method'], ged_options_to_string(self.__ged_options))
self.__mge.set_descent_method(self.__ged_options['method'], ged_options_to_string(self.__ged_options))
# Run the estimator.
mge.run(graph_ids, set_median_id, gen_median_id)
self.__mge.run(graph_ids, set_median_id, gen_median_id)
# Get SODs.
self.__sod_set_median = mge.get_sum_of_distances('initialized')
self.__sod_gen_median = mge.get_sum_of_distances('converged')
self.__sod_set_median = self.__mge.get_sum_of_distances('initialized')
self.__sod_gen_median = self.__mge.get_sum_of_distances('converged')
# Get median graphs.
self.__set_median = ged_env.get_nx_graph(set_median_id)
@@ -722,43 +733,6 @@ class MedianPreimageGenerator(PreimageGenerator):
print('distance in kernel space for generalized median:', self.__k_dis_gen_median)
print('minimum distance in kernel space for each graph in median set:', self.__k_dis_dataset)
print('distance in kernel space for each graph in median set:', k_dis_median_set)

def __set_graph_kernel_by_name(self):
if self._kernel_options['name'] == 'ShortestPath':
from gklearn.kernels import ShortestPath
self._graph_kernel = ShortestPath(node_labels=self._dataset.node_labels,
node_attrs=self._dataset.node_attrs,
ds_infos=self._dataset.get_dataset_infos(keys=['directed']),
**self._kernel_options)
elif self._kernel_options['name'] == 'StructuralSP':
from gklearn.kernels import StructuralSP
self._graph_kernel = StructuralSP(node_labels=self._dataset.node_labels,
edge_labels=self._dataset.edge_labels,
node_attrs=self._dataset.node_attrs,
edge_attrs=self._dataset.edge_attrs,
ds_infos=self._dataset.get_dataset_infos(keys=['directed']),
**self._kernel_options)
elif self._kernel_options['name'] == 'PathUpToH':
from gklearn.kernels import PathUpToH
self._graph_kernel = PathUpToH(node_labels=self._dataset.node_labels,
edge_labels=self._dataset.edge_labels,
ds_infos=self._dataset.get_dataset_infos(keys=['directed']),
**self._kernel_options)
elif self._kernel_options['name'] == 'Treelet':
from gklearn.kernels import Treelet
self._graph_kernel = Treelet(node_labels=self._dataset.node_labels,
edge_labels=self._dataset.edge_labels,
ds_infos=self._dataset.get_dataset_infos(keys=['directed']),
**self._kernel_options)
elif self._kernel_options['name'] == 'WeisfeilerLehman':
from gklearn.kernels import WeisfeilerLehman
self._graph_kernel = WeisfeilerLehman(node_labels=self._dataset.node_labels,
edge_labels=self._dataset.edge_labels,
ds_infos=self._dataset.get_dataset_infos(keys=['directed']),
**self._kernel_options)
else:
raise Exception('The graph kernel given is not defined. Possible choices include: "StructuralSP", "ShortestPath", "PathUpToH", "Treelet", "WeisfeilerLehman".')
# def __clean_graph(self, G, node_labels=[], edge_labels=[], node_attrs=[], edge_attrs=[]):


+ 30
- 7
gklearn/preimage/utils.py View File

@@ -25,7 +25,7 @@ import networkx as nx
import os


def generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged_options, mge_options, save_results=True, save_medians=True, plot_medians=True, load_gm='auto', dir_save='', irrelevant_labels=None, edge_required=False):
def generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged_options, mge_options, save_results=True, save_medians=True, plot_medians=True, load_gm='auto', dir_save='', irrelevant_labels=None, edge_required=False, cut_range=None):
import os.path
from gklearn.preimage import MedianPreimageGenerator
from gklearn.utils import split_dataset_by_target
@@ -38,7 +38,8 @@ def generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged
dataset_all.trim_dataset(edge_required=edge_required)
if irrelevant_labels is not None:
dataset_all.remove_labels(**irrelevant_labels)
# dataset_all.cut_graphs(range(0, 10))
if cut_range is not None:
dataset_all.cut_graphs(cut_range)
datasets = split_dataset_by_target(dataset_all)

if save_results:
@@ -57,6 +58,9 @@ def generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged
itrs_list = []
converged_list = []
num_updates_ecc_list = []
mge_decrease_order_list = []
mge_increase_order_list = []
mge_converged_order_list = []
nb_sod_sm2gm = [0, 0, 0]
nb_dis_k_sm2gm = [0, 0, 0]
nb_dis_k_gi2sm = [0, 0, 0]
@@ -148,7 +152,10 @@ def generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged
results['runtime_precompute_gm'], results['runtime_optimize_ec'],
results['runtime_generate_preimage'], results['runtime_total'],
results['itrs'], results['converged'],
results['num_updates_ecc']])
results['num_updates_ecc'],
results['mge']['num_decrease_order'] > 0, # @todo: not suitable for multi-start mge
results['mge']['num_increase_order'] > 0,
results['mge']['num_converged_descents'] > 0])
f_detail.close()
# compute result summary.
@@ -164,6 +171,9 @@ def generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged
itrs_list.append(results['itrs'])
converged_list.append(results['converged'])
num_updates_ecc_list.append(results['num_updates_ecc'])
mge_decrease_order_list.append(results['mge']['num_decrease_order'] > 0)
mge_increase_order_list.append(results['mge']['num_increase_order'] > 0)
mge_converged_order_list.append(results['mge']['num_converged_descents'] > 0)
# # SOD SM -> GM
if results['sod_set_median'] > results['sod_gen_median']:
nb_sod_sm2gm[0] += 1
@@ -210,7 +220,11 @@ def generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged
results['runtime_precompute_gm'], results['runtime_optimize_ec'],
results['runtime_generate_preimage'], results['runtime_total'],
results['itrs'], results['converged'],
results['num_updates_ecc'], nb_sod_sm2gm,
results['num_updates_ecc'],
results['mge']['num_decrease_order'] > 0, # @todo: not suitable for multi-start mge
results['mge']['num_increase_order'] > 0,
results['mge']['num_converged_descents'] > 0,
nb_sod_sm2gm,
nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm])
f_summary.close()
@@ -256,6 +270,9 @@ def generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged
itrs_mean = np.mean(itrs_list)
num_converged = np.sum(converged_list)
num_updates_ecc_mean = np.mean(num_updates_ecc_list)
num_mge_decrease_order = np.sum(mge_decrease_order_list)
num_mge_increase_order = np.sum(mge_increase_order_list)
num_mge_converged = np.sum(mge_converged_order_list)
sod_sm2gm_mean = get_relations(np.sign(sod_gm_mean - sod_sm_mean))
dis_k_sm2gm_mean = get_relations(np.sign(dis_k_gm_mean - dis_k_sm_mean))
dis_k_gi2sm_mean = get_relations(np.sign(dis_k_sm_mean - dis_k_gi_min_mean))
@@ -270,7 +287,9 @@ def generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged
dis_k_gi2sm_mean, dis_k_gi2gm_mean,
time_precompute_gm_mean, time_optimize_ec_mean,
time_generate_mean, time_total_mean, itrs_mean,
num_converged, num_updates_ecc_mean])
num_converged, num_updates_ecc_mean,
num_mge_decrease_order, num_mge_increase_order,
num_mge_converged])
f_summary.close()
# save total pairwise kernel distances.
@@ -300,7 +319,8 @@ def __init_output_file(ds_name, gkernel, fit_method, dir_output):
'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
'dis_k gi -> GM', 'edit cost constants', 'time precompute gm',
'time optimize ec', 'time generate preimage', 'time total',
'itrs', 'converged', 'num updates ecc'])
'itrs', 'converged', 'num updates ecc', 'mge decrease order',
'mge increase order', 'mge converged'])
f_detail.close()
# fn_output_summary = 'results_summary.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv'
@@ -312,7 +332,8 @@ def __init_output_file(ds_name, gkernel, fit_method, dir_output):
'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
'dis_k gi -> GM', 'time precompute gm', 'time optimize ec',
'time generate preimage', 'time total', 'itrs', 'num converged',
'num updates ecc', '# SOD SM -> GM', '# dis_k SM -> GM',
'num updates ecc', 'mge num decrease order', 'mge num increase order',
'mge num converged', '# SOD SM -> GM', '# dis_k SM -> GM',
'# dis_k gi -> SM', '# dis_k gi -> GM'])
# 'repeats better SOD SM -> GM',
# 'repeats better dis_k SM -> GM', 'repeats better dis_k gi -> SM',
@@ -418,6 +439,8 @@ def compute_kernel(Gn, graph_kernel, node_label, edge_label, verbose, parallel='
Kmatrix, _ = weisfeilerlehmankernel(Gn, node_label=node_label, edge_label=edge_label,
height=4, base_kernel='subtree', parallel=None,
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
else:
raise Exception('The graph kernel "', graph_kernel, '" is not defined.')
# normalization
Kmatrix_diag = Kmatrix.diagonal().copy()


+ 6
- 6
gklearn/tests/test_graph_kernels.py View File

@@ -260,20 +260,20 @@ def test_Treelet(ds_name, parallel):
@pytest.mark.parametrize('ds_name', ['Acyclic'])
#@pytest.mark.parametrize('base_kernel', ['subtree', 'sp', 'edge'])
@pytest.mark.parametrize('base_kernel', ['subtree'])
# @pytest.mark.parametrize('base_kernel', ['subtree'])
@pytest.mark.parametrize('parallel', ['imap_unordered', None])
def test_WeisfeilerLehman(ds_name, parallel, base_kernel):
"""Test Weisfeiler-Lehman kernel.
def test_WLSubtree(ds_name, parallel):
"""Test Weisfeiler-Lehman subtree kernel.
"""
from gklearn.kernels import WeisfeilerLehman
from gklearn.kernels import WLSubtree
dataset = chooseDataset(ds_name)

try:
graph_kernel = WeisfeilerLehman(node_labels=dataset.node_labels,
graph_kernel = WLSubtree(node_labels=dataset.node_labels,
edge_labels=dataset.edge_labels,
ds_infos=dataset.get_dataset_infos(keys=['directed']),
height=2, base_kernel=base_kernel)
height=2)
gram_matrix, run_time = graph_kernel.compute(dataset.graphs,
parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True)
kernel_list, run_time = graph_kernel.compute(dataset.graphs[0], dataset.graphs[1:],


+ 1
- 0
gklearn/utils/__init__.py View File

@@ -20,4 +20,5 @@ from gklearn.utils.graph_files import load_dataset, save_dataset
from gklearn.utils.timer import Timer
from gklearn.utils.utils import get_graph_kernel_by_name
from gklearn.utils.utils import compute_gram_matrices_by_class
from gklearn.utils.utils import SpecialLabel
from gklearn.utils.trie import Trie

+ 102
- 30
gklearn/utils/dataset.py View File

@@ -56,13 +56,14 @@ class Dataset(object):
self.__node_attrs = label_names['node_attrs']
self.__edge_labels = label_names['edge_labels']
self.__edge_attrs = label_names['edge_attrs']
self.clean_labels()
def load_graphs(self, graphs, targets=None):
# this has to be followed by set_labels().
self.__graphs = graphs
self.__targets = targets
# self.set_labels_attrs()
# self.set_labels_attrs() # @todo
def load_predefined_dataset(self, ds_name):
@@ -89,6 +90,9 @@ class Dataset(object):
elif ds_name == 'Cuneiform':
ds_file = current_path + '../../datasets/Cuneiform/Cuneiform_A.txt'
self.__graphs, self.__targets, label_names = load_dataset(ds_file)
elif ds_name == 'DD':
ds_file = current_path + '../../datasets/DD/DD_A.txt'
self.__graphs, self.__targets, label_names = load_dataset(ds_file)
elif ds_name == 'Fingerprint':
ds_file = current_path + '../../datasets/Fingerprint/Fingerprint_A.txt'
self.__graphs, self.__targets, label_names = load_dataset(ds_file)
@@ -113,6 +117,9 @@ class Dataset(object):
elif ds_name == 'MUTAG':
ds_file = current_path + '../../datasets/MUTAG/MUTAG_A.txt'
self.__graphs, self.__targets, label_names = load_dataset(ds_file)
elif ds_name == 'PAH':
ds_file = current_path + '../../datasets/PAH/dataset.ds'
self.__graphs, self.__targets, label_names = load_dataset(ds_file)
elif ds_name == 'SYNTHETIC':
pass
elif ds_name == 'SYNTHETICnew':
@@ -120,11 +127,14 @@ class Dataset(object):
self.__graphs, self.__targets, label_names = load_dataset(ds_file)
elif ds_name == 'Synthie':
pass
else:
raise Exception('The dataset name "', ds_name, '" is not pre-defined.')
self.__node_labels = label_names['node_labels']
self.__node_attrs = label_names['node_attrs']
self.__edge_labels = label_names['edge_labels']
self.__edge_attrs = label_names['edge_attrs']
self.clean_labels()

def set_labels(self, node_labels=[], node_attrs=[], edge_labels=[], edge_attrs=[]):
@@ -138,27 +148,27 @@ class Dataset(object):
# @todo: remove labels which have only one possible values.
if node_labels is None:
self.__node_labels = self.__graphs[0].graph['node_labels']
# # graphs are considered node unlabeled if all nodes have the same label.
# infos.update({'node_labeled': is_nl if node_label_num > 1 else False})
# # graphs are considered node unlabeled if all nodes have the same label.
# infos.update({'node_labeled': is_nl if node_label_num > 1 else False})
if node_attrs is None:
self.__node_attrs = self.__graphs[0].graph['node_attrs']
# for G in Gn:
# for n in G.nodes(data=True):
# if 'attributes' in n[1]:
# return len(n[1]['attributes'])
# return 0
# for G in Gn:
# for n in G.nodes(data=True):
# if 'attributes' in n[1]:
# return len(n[1]['attributes'])
# return 0
if edge_labels is None:
self.__edge_labels = self.__graphs[0].graph['edge_labels']
# # graphs are considered edge unlabeled if all edges have the same label.
# infos.update({'edge_labeled': is_el if edge_label_num > 1 else False})
# # graphs are considered edge unlabeled if all edges have the same label.
# infos.update({'edge_labeled': is_el if edge_label_num > 1 else False})
if edge_attrs is None:
self.__edge_attrs = self.__graphs[0].graph['edge_attrs']
# for G in Gn:
# if nx.number_of_edges(G) > 0:
# for e in G.edges(data=True):
# if 'attributes' in e[2]:
# return len(e[2]['attributes'])
# return 0
# for G in Gn:
# if nx.number_of_edges(G) > 0:
# for e in G.edges(data=True):
# if 'attributes' in e[2]:
# return len(e[2]['attributes'])
# return 0
def get_dataset_infos(self, keys=None):
@@ -323,7 +333,7 @@ class Dataset(object):
if self.__node_label_nums is None:
self.__node_label_nums = {}
for node_label in self.__node_labels:
self.__node_label_nums[node_label] = self.get_node_label_num(node_label)
self.__node_label_nums[node_label] = self.__get_node_label_num(node_label)
infos['node_label_nums'] = self.__node_label_nums
if 'edge_label_dim' in keys:
@@ -335,7 +345,7 @@ class Dataset(object):
if self.__edge_label_nums is None:
self.__edge_label_nums = {}
for edge_label in self.__edge_labels:
self.__edge_label_nums[edge_label] = self.get_edge_label_num(edge_label)
self.__edge_label_nums[edge_label] = self.__get_edge_label_num(edge_label)
infos['edge_label_nums'] = self.__edge_label_nums
if 'directed' in keys or 'substructures' in keys:
@@ -411,33 +421,95 @@ class Dataset(object):
def remove_labels(self, node_labels=[], edge_labels=[], node_attrs=[], edge_attrs=[]):
node_labels = [item for item in node_labels if item in self.__node_labels]
edge_labels = [item for item in edge_labels if item in self.__edge_labels]
node_attrs = [item for item in node_attrs if item in self.__node_attrs]
edge_attrs = [item for item in edge_attrs if item in self.__edge_attrs]

for g in self.__graphs:
for nd in g.nodes():
for nl in node_labels:
del g.nodes[nd][nl]
del g.nodes[nd][nl]
for na in node_attrs:
del g.nodes[nd][na]
for ed in g.edges():
for el in edge_labels:
del g.edges[ed][el]
del g.edges[ed][el]
for ea in edge_attrs:
del g.edges[ed][ea]
del g.edges[ed][ea]
if len(node_labels) > 0:
self.__node_labels = [nl for nl in self.__node_labels if nl not in node_labels]
self.__node_labels = [nl for nl in self.__node_labels if nl not in node_labels]
if len(edge_labels) > 0:
self.__edge_labels = [el for el in self.__edge_labels if el not in edge_labels]
self.__edge_labels = [el for el in self.__edge_labels if el not in edge_labels]
if len(node_attrs) > 0:
self.__node_attrs = [na for na in self.__node_attrs if na not in node_attrs]
self.__node_attrs = [na for na in self.__node_attrs if na not in node_attrs]
if len(edge_attrs) > 0:
self.__edge_attrs = [ea for ea in self.__edge_attrs if ea not in edge_attrs]
self.__edge_attrs = [ea for ea in self.__edge_attrs if ea not in edge_attrs]
def clean_labels(self):
    """Drop label/attribute names that do not discriminate anything.

    For each of the four name lists (node labels, edge labels, node
    attributes, edge attributes), a name is kept only if at least two
    distinct values of it are found over all graphs of the dataset.
    Every other name is removed from the corresponding list and the
    attribute is deleted from every node/edge that carries it.
    """
    graphs = self.__graphs

    def varies(get_values, name):
        # True as soon as two distinct values have been seen; stops
        # scanning the remaining graphs at that point.
        seen = set()
        for G in graphs:
            seen.update(get_values(G, name).values())
            if len(seen) > 1:
                return True
        return False

    def clean(names, get_values, iter_data):
        # Return the informative names; strip the others from the graphs.
        kept = []
        for name in names:
            if varies(get_values, name):
                kept.append(name)
                continue
            for G in graphs:
                for data in iter_data(G):
                    # pop() rather than del: nodes/edges that never had
                    # this attribute must not raise KeyError.
                    data.pop(name, None)
        return kept

    def node_data(G):
        return (G.nodes[nd] for nd in G.nodes())

    def edge_data(G):
        return (G.edges[ed] for ed in G.edges())

    self.__node_labels = clean(self.__node_labels, nx.get_node_attributes, node_data)
    self.__edge_labels = clean(self.__edge_labels, nx.get_edge_attributes, edge_data)
    self.__node_attrs = clean(self.__node_attrs, nx.get_node_attributes, node_data)
    self.__edge_attrs = clean(self.__edge_attrs, nx.get_edge_attributes, edge_data)
def cut_graphs(self, range_):
self.__graphs = [self.__graphs[i] for i in range_]
if self.__targets is not None:
self.__targets = [self.__targets[i] for i in range_]
# @todo
# self.set_labels_attrs()
self.clean_labels()


def trim_dataset(self, edge_required=False):
@@ -448,8 +520,7 @@ class Dataset(object):
idx = [p[0] for p in trimed_pairs]
self.__graphs = [p[1] for p in trimed_pairs]
self.__targets = [self.__targets[i] for i in idx]
# @todo
# self.set_labels_attrs()
self.clean_labels()
def __get_dataset_size(self):
@@ -652,4 +723,5 @@ def split_dataset_by_target(dataset):
sub_dataset.load_graphs(sub_graphs, [key] * len(val))
sub_dataset.set_labels(node_labels=dataset.node_labels, node_attrs=dataset.node_attrs, edge_labels=dataset.edge_labels, edge_attrs=dataset.edge_attrs)
datasets.append(sub_dataset)
# @todo: clean_labels?
return datasets

+ 6
- 6
gklearn/utils/graph_files.py View File

@@ -63,7 +63,7 @@ def load_dataset(filename, filename_targets=None, gformat=None, **kwargs):
return data, y, label_names


def save_dataset(Gn, y, gformat='gxl', group=None, filename='gfile', xparams=None):
def save_dataset(Gn, y, gformat='gxl', group=None, filename='gfile', **kwargs):
"""Save list of graphs.
"""
import os
@@ -73,22 +73,22 @@ def save_dataset(Gn, y, gformat='gxl', group=None, filename='gfile', xparams=Non
if not os.path.exists(dirname_ds) :
os.makedirs(dirname_ds)
if xparams is not None and 'graph_dir' in xparams:
graph_dir = xparams['graph_dir'] + '/'
if 'graph_dir' in kwargs:
graph_dir = kwargs['graph_dir'] + '/'
if not os.path.exists(graph_dir):
os.makedirs(graph_dir)
del kwargs['graph_dir']
else:
graph_dir = dirname_ds
if group == 'xml' and gformat == 'gxl':
kwargs = {'method': xparams['method']} if xparams is not None else {}
with open(filename + '.xml', 'w') as fgroup:
fgroup.write("<?xml version=\"1.0\"?>")
fgroup.write("\n<!DOCTYPE GraphCollection SYSTEM \"http://www.inf.unibz.it/~blumenthal/dtd/GraphCollection.dtd\">")
fgroup.write("\n<GraphCollection>")
for idx, g in enumerate(Gn):
fname_tmp = "graph" + str(idx) + ".gxl"
saveGXL(g, graph_dir + fname_tmp, **kwargs)
save_gxl(g, graph_dir + fname_tmp, **kwargs)
fgroup.write("\n\t<graph file=\"" + fname_tmp + "\" class=\"" + str(y[idx]) + "\"/>")
fgroup.write("\n</GraphCollection>")
fgroup.close()
@@ -226,7 +226,7 @@ def load_gxl(filename): # @todo: directed graphs.
return g, label_names


def saveGXL(graph, filename, method='default', node_labels=[], edge_labels=[], node_attrs=[], edge_attrs=[]):
def save_gxl(graph, filename, method='default', node_labels=[], edge_labels=[], node_attrs=[], edge_attrs=[]):
if method == 'default':
gxl_file = open(filename, 'w')
gxl_file.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")


+ 53
- 6
gklearn/utils/utils.py View File

@@ -1,6 +1,7 @@
import networkx as nx
import numpy as np
from copy import deepcopy
from enum import Enum, auto
#from itertools import product

# from tqdm import tqdm
@@ -299,21 +300,59 @@ def get_edge_labels(Gn, edge_label):


def get_graph_kernel_by_name(name, node_labels=None, edge_labels=None, node_attrs=None, edge_attrs=None, ds_infos=None, kernel_options={}):
if name == 'structuralspkernel':
if name == 'ShortestPath':
from gklearn.kernels import ShortestPath
graph_kernel = ShortestPath(node_labels=node_labels,
node_attrs=node_attrs,
ds_infos=ds_infos,
**kernel_options)
elif name == 'StructuralSP':
from gklearn.kernels import StructuralSP
graph_kernel = StructuralSP(node_labels=node_labels, edge_labels=edge_labels,
node_attrs=node_attrs, edge_attrs=edge_attrs,
ds_infos=ds_infos, **kernel_options)
graph_kernel = StructuralSP(node_labels=node_labels,
edge_labels=edge_labels,
node_attrs=node_attrs,
edge_attrs=edge_attrs,
ds_infos=ds_infos,
**kernel_options)
elif name == 'PathUpToH':
from gklearn.kernels import PathUpToH
graph_kernel = PathUpToH(node_labels=node_labels,
edge_labels=edge_labels,
ds_infos=ds_infos,
**kernel_options)
elif name == 'Treelet':
from gklearn.kernels import Treelet
graph_kernel = Treelet(node_labels=node_labels,
edge_labels=edge_labels,
ds_infos=ds_infos,
**kernel_options)
elif name == 'WLSubtree':
from gklearn.kernels import WLSubtree
graph_kernel = WLSubtree(node_labels=node_labels,
edge_labels=edge_labels,
ds_infos=ds_infos,
**kernel_options)
elif name == 'WeisfeilerLehman':
from gklearn.kernels import WeisfeilerLehman
graph_kernel = WeisfeilerLehman(node_labels=node_labels,
edge_labels=edge_labels,
ds_infos=ds_infos,
**kernel_options)
else:
raise Exception('The graph kernel given is not defined. Possible choices include: "StructuralSP", "ShortestPath", "PathUpToH", "Treelet", "WLSubtree", "WeisfeilerLehman".')

return graph_kernel


def compute_gram_matrices_by_class(ds_name, kernel_options, save_results=True, dir_save='', irrelevant_labels=None):
def compute_gram_matrices_by_class(ds_name, kernel_options, save_results=True, dir_save='', irrelevant_labels=None, edge_required=False):
import os
from gklearn.utils import Dataset, split_dataset_by_target
# 1. get dataset.
print('1. getting dataset...')
dataset_all = Dataset()
dataset_all.load_predefined_dataset(ds_name)
dataset_all.trim_dataset(edge_required=edge_required)
if not irrelevant_labels is None:
dataset_all.remove_labels(**irrelevant_labels)
# dataset_all.cut_graphs(range(0, 10))
@@ -349,6 +388,8 @@ def compute_gram_matrices_by_class(ds_name, kernel_options, save_results=True, d
print()
print('4. saving results...')
if save_results:
if not os.path.exists(dir_save):
os.makedirs(dir_save)
np.savez(dir_save + 'gram_matrix_unnorm.' + ds_name + '.' + kernel_options['name'] + '.gm', gram_matrix_unnorm_list=gram_matrix_unnorm_list, run_time_list=run_time_list)

print('\ncomplete.')
@@ -424,4 +465,10 @@ def get_mlti_dim_edge_attrs(G, attr_names):
attributes = []
for ed, attrs in G.edges(data=True):
attributes.append(tuple(attrs[aname] for aname in attr_names))
return attributes
return attributes


class SpecialLabel(Enum):
    """Can be used to define special labels.

    Members are meant to be used in place of ordinary label names, so
    that they cannot be confused with a label name coming from data.
    """
    # ``auto`` must be *called*; a bare ``auto`` would store the ``auto``
    # class itself as the member value instead of a generated integer.
    DUMMY = auto()  # The dummy label.

Loading…
Cancel
Save