
Test of a new preimage algorithm.

v0.1
jajupmochi 6 years ago
parent commit a13dd94f6a
11 changed files with 638 additions and 140 deletions
  1. +3
    -0
      .gitignore
  2. +196
    -0
      preimage/gk_iam.py
  3. +195
    -0
      preimage/iam.py
  4. +5
    -0
      preimage/librariesImport.py
  5. +5
    -0
      preimage/librariesImport2.py
  6. +1
    -0
      preimage/preimage.py
  7. +26
    -0
      preimage/setup.py
  8. +57
    -0
      preimage/test.py
  9. +3
    -3
      pygraph/utils/graphdataset.py
  10. +89
    -39
      pygraph/utils/graphfiles.py
  11. +58
    -98
      pygraph/utils/model_selection_precomputed.py

+ 3
- 0
.gitignore View File

@@ -20,5 +20,8 @@ pygraph/kernels/*_sym.py
*.dat
*.pyc

preimage/*
!preimage/*.py

__pycache__
##*#

+ 196
- 0
preimage/gk_iam.py View File

@@ -0,0 +1,196 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 30 17:07:43 2019

A graph pre-image method combining iterative pre-image method in reference [1]
and the iterative alternate minimizations (IAM) in reference [2].
@author: ljia
@references:
[1] Gökhan H Bakir, Alexander Zien, and Koji Tsuda. Learning to find graph
pre-images. In Joint Pattern Recognition Symposium, pages 253-261. Springer, 2004.
[2] Generalized median graph via iterative alternate minimization.
"""
import sys
import numpy as np
import multiprocessing
from tqdm import tqdm
import networkx as nx
import matplotlib.pyplot as plt

from iam import iam


def gk_iam(Gn, alpha):
"""This function constructs graph pre-image by the iterative pre-image
framework in reference [1], algorithm 1, where the step of generating new
graphs randomly is replaced by the IAM algorithm in reference [2].
Notes
-----
Every time a better graph is acquired, the older one is replaced by it.
"""
# compute k nearest neighbors of phi in DN.
dis_list = [] # distance between g_star and each graph.
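# The squared distance in kernel space between phi(g) and
# g_star = alpha * phi(g1) + (1 - alpha) * phi(g2) expands, via the kernel
# trick, to k(g, g) - 2 * (alpha * k(g, g1) + (1 - alpha) * k(g, g2))
# + alpha^2 * k(g1, g1) + 2 * alpha * (1 - alpha) * k(g1, g2)
# + (1 - alpha)^2 * k(g2, g2); the module-level kernel lists supply these values.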
for ig, g in tqdm(enumerate(Gn), desc='computing distances', file=sys.stdout):
dtemp = k_list[ig] - 2 * (alpha * k_g1_list[ig] + (1 - alpha) *
k_g2_list[ig]) + (alpha * alpha * k_list[idx1] + alpha *
(1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha *
k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2])
dis_list.append(dtemp)
# sort
sort_idx = np.argsort(dis_list)
dis_gs = [dis_list[idis] for idis in sort_idx[0:k]]
g0hat = Gn[sort_idx[0]] # the nearest neighbor of phi in DN
if dis_gs[0] == 0: # the exact pre-image.
print('The exact pre-image is found from the input dataset.')
return 0, g0hat
dhat = dis_gs[0] # the nearest distance
Gk = [Gn[ig] for ig in sort_idx[0:k]] # the k nearest neighbors
gihat_list = []
# i = 1
r = 1
while r < r_max:
print('r =', r)
# found = False
Gs_nearest = Gk + gihat_list
g_tmp = iam(Gs_nearest)
# compute distance between phi and the new generated graph.
knew = marginalizedkernel([g_tmp, g1, g2], node_label='atom', edge_label=None,
p_quit=lmbda, n_iteration=20, remove_totters=False,
n_jobs=multiprocessing.cpu_count(), verbose=False)
dnew = knew[0][0, 0] - 2 * (alpha * knew[0][0, 1] + (1 - alpha) *
knew[0][0, 2]) + (alpha * alpha * k_list[idx1] + alpha *
(1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha *
k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2])
if dnew <= dhat: # the new distance is smaller
print('I am smaller!')
dhat = dnew
g_new = g_tmp.copy() # found better graph.
gihat_list = [g_new]
dis_gs.append(dhat)
r = 0
else:
r += 1
ghat = ([g0hat] if len(gihat_list) == 0 else gihat_list)
return dhat, ghat


def gk_iam_nearest(Gn, alpha):
"""This function constructs graph pre-image by the iterative pre-image
framework in reference [1], algorithm 1, where the step of generating new
graphs randomly is replaced by the IAM algorithm in reference [2].
Notes
-----
Every time a better graph is acquired, its distance in kernel space is
compared with the k nearest ones, and the k nearest distances from the k+1
distances will be used as the new ones.
"""
# compute k nearest neighbors of phi in DN.
dis_list = [] # distance between g_star and each graph.
for ig, g in tqdm(enumerate(Gn), desc='computing distances', file=sys.stdout):
dtemp = k_list[ig] - 2 * (alpha * k_g1_list[ig] + (1 - alpha) *
k_g2_list[ig]) + (alpha * alpha * k_list[idx1] + alpha *
(1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha *
k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2])
dis_list.append(dtemp)
# sort
sort_idx = np.argsort(dis_list)
dis_gs = [dis_list[idis] for idis in sort_idx[0:k]] # the k shortest distances
g0hat = Gn[sort_idx[0]] # the nearest neighbor of phi in DN
if dis_gs[0] == 0: # the exact pre-image.
print('The exact pre-image is found from the input dataset.')
return 0, g0hat
dhat = dis_gs[0] # the nearest distance
ghat = g0hat
Gk = [Gn[ig] for ig in sort_idx[0:k]] # the k nearest neighbors
Gs_nearest = Gk
# gihat_list = []
# i = 1
r = 1
while r < r_max:
print('r =', r)
# found = False
# Gs_nearest = Gk + gihat_list
g_tmp = iam(Gs_nearest)
# compute distance between phi and the new generated graph.
knew = marginalizedkernel([g_tmp, g1, g2], node_label='atom', edge_label=None,
p_quit=lmbda, n_iteration=20, remove_totters=False,
n_jobs=multiprocessing.cpu_count(), verbose=False)
dnew = knew[0][0, 0] - 2 * (alpha * knew[0][0, 1] + (1 - alpha) *
knew[0][0, 2]) + (alpha * alpha * k_list[idx1] + alpha *
(1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha *
k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2])
if dnew <= dhat: # the new distance is smaller
print('I am smaller!')
dhat = dnew
g_new = g_tmp.copy() # found better graph.
ghat = g_tmp.copy()
dis_gs.append(dhat) # add the new nearest distance.
Gs_nearest.append(g_new) # add the corresponding graph.
sort_idx = np.argsort(dis_gs)
dis_gs = [dis_gs[idx] for idx in sort_idx[0:k]] # the new k nearest distances.
Gs_nearest = [Gs_nearest[idx] for idx in sort_idx[0:k]]
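# dis_gs and Gs_nearest are truncated with the same sort permutation,
# so entry i always pairs a graph with its distance to g_star.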
r = 0
else:
r += 1
return dhat, ghat

if __name__ == '__main__':
import sys
sys.path.insert(0, "../")
from pygraph.kernels.marginalizedKernel import marginalizedkernel
from pygraph.utils.graphfiles import loadDataset
ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}} # node/edge symb
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
# Gn = Gn[0:10]
lmbda = 0.03 # termination probability
r_max = 10 # maximum number of iterations without improvement
l = 500
alpha_range = np.linspace(0.1, 0.9, 9)
k = 5 # k nearest neighbors
# randomly select two molecules
np.random.seed(1)
idx1, idx2 = np.random.randint(0, len(Gn), 2)
g1 = Gn[idx1]
g2 = Gn[idx2]
# compute
k_list = [] # kernel between each graph and itself.
k_g1_list = [] # kernel between each graph and g1
k_g2_list = [] # kernel between each graph and g2
for ig, g in tqdm(enumerate(Gn), desc='computing self kernels', file=sys.stdout):
ktemp = marginalizedkernel([g, g1, g2], node_label='atom', edge_label=None,
p_quit=lmbda, n_iteration=20, remove_totters=False,
n_jobs=multiprocessing.cpu_count(), verbose=False)
k_list.append(ktemp[0][0, 0])
k_g1_list.append(ktemp[0][0, 1])
k_g2_list.append(ktemp[0][0, 2])

g_best = []
dis_best = []
# for each alpha
for alpha in alpha_range:
print('alpha =', alpha)
dhat, ghat = gk_iam_nearest(Gn, alpha)
dis_best.append(dhat)
g_best.append(ghat)
for idx, item in enumerate(alpha_range):
print('when alpha is', item, 'the shortest distance is', dis_best[idx])
print('the corresponding pre-image is')
nx.draw_networkx(g_best[idx])
plt.show()
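
Both functions above inline the same kernel-space distance expansion twice. A minimal sketch of a helper that would make the expansion explicit (the name dist_to_gstar and the explicit kernel-value arguments are illustrative, not part of this commit):

def dist_to_gstar(k_gg, k_gg1, k_gg2, alpha, k11, k12, k22):
    # squared kernel-space distance between phi(g) and
    # alpha * phi(g1) + (1 - alpha) * phi(g2), given the kernel values
    # k(g,g), k(g,g1), k(g,g2), k(g1,g1), k(g1,g2) and k(g2,g2).
    return (k_gg - 2 * (alpha * k_gg1 + (1 - alpha) * k_gg2)
            + alpha ** 2 * k11 + 2 * alpha * (1 - alpha) * k12
            + (1 - alpha) ** 2 * k22)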

+ 195
- 0
preimage/iam.py View File

@@ -0,0 +1,195 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Apr 26 11:49:12 2019

Iterative alternate minimizations using GED.
@author: ljia
"""
import numpy as np
import random
import networkx as nx

import sys
#from Cython_GedLib_2 import librariesImport, script
import librariesImport, script
sys.path.insert(0, "../")
from pygraph.utils.graphfiles import saveDataset
from pygraph.utils.graphdataset import get_dataset_attributes


def iam(Gn, node_label='atom', edge_label='bond_type'):
"""See my name, then you know what I do.
"""
# Gn = Gn[0:10]
Gn = [nx.convert_node_labels_to_integers(g) for g in Gn]
c_er = 1
c_es = 1
c_ei = 1
# phase 1: initialize.
# compute set-median.
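# The set-median is the member of Gn minimizing the sum of GEDs to all
# graphs in Gn; it serves as the starting point of the refinement below.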
dis_min = np.inf
pi_p = []
pi_all = []
for idx1, G_p in enumerate(Gn):
dist_sum = 0
pi_all.append([])
for idx2, G_p_prime in enumerate(Gn):
dist_tmp, pi_tmp = GED(G_p, G_p_prime)
pi_all[idx1].append(pi_tmp)
dist_sum += dist_tmp
if dist_sum < dis_min:
dis_min = dist_sum
G = G_p.copy()
idx_min = idx1
# node maps from the set-median to every graph in Gn.
pi_p = pi_all[idx_min]
# phase 2: iteration.
ds_attrs = get_dataset_attributes(Gn, attr_names=['edge_labeled', 'node_attr_dim'],
edge_label=edge_label)
for itr in range(0, 10):
G_new = G.copy()
# update vertex labels.
# pre-compute h_i0 for each label.
# for label in get_node_labels(Gn, node_label):
# print(label)
# for nd in G.nodes(data=True):
# pass
if not ds_attrs['node_attr_dim']: # labels are symbolic
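# For each node, count over all input graphs how often its image under
# the node map pi_p carries each candidate label (h_i0), then keep the
# most frequent label, breaking ties uniformly at random.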
for nd, _ in G.nodes(data=True):
h_i0_list = []
label_list = []
for label in get_node_labels(Gn, node_label):
h_i0 = 0
for idx, g in enumerate(Gn):
pi_i = pi_p[idx][nd]
if g.has_node(pi_i) and g.nodes[pi_i][node_label] == label:
h_i0 += 1
h_i0_list.append(h_i0)
label_list.append(label)
# choose one of the best randomly.
idx_max = np.argwhere(h_i0_list == np.max(h_i0_list)).flatten().tolist()
idx_rdm = random.randint(0, len(idx_max) - 1)
G_new.nodes[nd][node_label] = label_list[idx_max[idx_rdm]]
else: # labels are non-symbolic
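# Non-symbolic attributes are replaced by the mean attribute vector of
# the mapped nodes, averaged over the graphs that contain the image node.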
for nd, _ in G.nodes(data=True):
Si_norm = 0
phi_i_bar = np.array([0.0 for _ in range(ds_attrs['node_attr_dim'])])
for idx, g in enumerate(Gn):
pi_i = pi_p[idx][nd]
if g.has_node(pi_i): #@todo: what if no g has node? phi_i_bar = 0?
Si_norm += 1
phi_i_bar += np.array([float(itm) for itm in g.nodes[pi_i]['attributes']])
phi_i_bar /= Si_norm
G_new.nodes[nd]['attributes'] = phi_i_bar
# update edge labels and adjacency matrix.
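# Decision rule (from the IAM formulation): with edge-removal cost c_er,
# edge-insertion cost c_ei and edge-substitution cost c_es, an edge
# (nd1, nd2) is kept with its majority label only if its support
# (h_ij0_max, or sij_norm in the unlabeled case) exceeds the threshold
# below; otherwise removing the edge is cheaper on average.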
if ds_attrs['edge_labeled']:
for nd1, nd2, _ in G.edges(data=True):
h_ij0_list = []
label_list = []
for label in get_edge_labels(Gn, edge_label):
h_ij0 = 0
for idx, g in enumerate(Gn):
pi_i = pi_p[idx][nd1]
pi_j = pi_p[idx][nd2]
h_ij0_p = (g.has_node(pi_i) and g.has_node(pi_j) and
g.has_edge(pi_i, pi_j) and
g.edges[pi_i, pi_j][edge_label] == label)
h_ij0 += h_ij0_p
h_ij0_list.append(h_ij0)
label_list.append(label)
# choose one of the best randomly.
idx_max = np.argwhere(h_ij0_list == np.max(h_ij0_list)).flatten().tolist()
h_ij0_max = h_ij0_list[idx_max[0]]
idx_rdm = random.randint(0, len(idx_max) - 1)
best_label = label_list[idx_max[idx_rdm]]
# check whether a_ij is 0 or 1.
sij_norm = 0
for idx, g in enumerate(Gn):
pi_i = pi_p[idx][nd1]
pi_j = pi_p[idx][nd2]
if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
sij_norm += 1
if h_ij0_max > len(Gn) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es):
if not G_new.has_edge(nd1, nd2):
G_new.add_edge(nd1, nd2)
G_new.edges[nd1, nd2][edge_label] = best_label
else:
if G_new.has_edge(nd1, nd2):
G_new.remove_edge(nd1, nd2)
else: # if edges are unlabeled
for nd1, nd2, _ in G.edges(data=True):
sij_norm = 0
for idx, g in enumerate(Gn):
pi_i = pi_p[idx][nd1]
pi_j = pi_p[idx][nd2]
if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
sij_norm += 1
if sij_norm > len(Gn) * c_er / (c_er + c_ei):
if not G_new.has_edge(nd1, nd2):
G_new.add_edge(nd1, nd2)
else:
if G_new.has_edge(nd1, nd2):
G_new.remove_edge(nd1, nd2)
G = G_new.copy()
return G


def GED(g1, g2, lib='gedlib'):
"""
Compute GED. It is a dummy function for now.
"""
if lib == 'gedlib':
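# Workflow of the gedlib wrapper: dump both graphs to a GXL collection,
# load it into a fresh environment, run the BIPARTITE method with the
# CHEM_1 edit costs, then read back the node map and the distance bounds.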
saveDataset([g1, g2], [None, None], group='xml', filename='ged_tmp/tmp')
script.appel()
script.PyRestartEnv()
script.PyLoadGXLGraph('ged_tmp/', 'collections/tmp.xml')
listID = script.PyGetGraphIds()
script.PySetEditCost("CHEM_1")
script.PyInitEnv()
script.PySetMethod("BIPARTITE", "")
script.PyInitMethod()
g = listID[0]
h = listID[1]
script.PyRunMethod(g, h)
liste = script.PyGetAllMap(g, h)
upper = script.PyGetUpperBound(g, h)
lower = script.PyGetLowerBound(g, h)
dis = upper + lower
pi = liste[0]
return dis, pi


def get_node_labels(Gn, node_label):
nl = set()
for G in Gn:
nl = nl | set(nx.get_node_attributes(G, node_label).values())
return nl


def get_edge_labels(Gn, edge_label):
el = set()
for G in Gn:
el = el | set(nx.get_edge_attributes(G, edge_label).values())
return el


if __name__ == '__main__':
from pygraph.utils.graphfiles import loadDataset
ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}} # node/edge symb
# ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt',
# 'extra_params': {}} # node nsymb
# ds = {'name': 'Acyclic', 'dataset': '../datasets/monoterpenoides/trainset_9.ds',
# 'extra_params': {}}
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])

iam(Gn)

+ 5
- 0
preimage/librariesImport.py View File

@@ -0,0 +1,5 @@
from ctypes import *
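# Preload the shared libraries that the compiled GedLib module depends on,
# so that "import script" works without setting LD_LIBRARY_PATH.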
lib1 = cdll.LoadLibrary('lib/fann/libdoublefann.so')
lib2 = cdll.LoadLibrary('lib/libsvm.3.22/libsvm.so')
lib3 = cdll.LoadLibrary('lib/nomad/libnomad.so')
lib4 = cdll.LoadLibrary('lib/nomad/libsgtelib.so')

+ 5
- 0
preimage/librariesImport2.py View File

@@ -0,0 +1,5 @@
from ctypes import *
lib1 = cdll.LoadLibrary('Cython_GedLib_2/lib/fann/libdoublefann.so')
lib2 = cdll.LoadLibrary('Cython_GedLib_2/lib/libsvm.3.22/libsvm.so')
lib3 = cdll.LoadLibrary('Cython_GedLib_2/lib/nomad/libnomad.so')
lib4 = cdll.LoadLibrary('Cython_GedLib_2/lib/nomad/libsgtelib.so')

+ 1
- 0
preimage/preimage.py View File

@@ -126,6 +126,7 @@ for alpha in alpha_range:
dhat = dnew
gnew = gtemp.copy()
found = True # found better graph.
r = 0
if found:
gihat_list = [gnew]
dis_gs.append(dhat)


+ 26
- 0
preimage/setup.py View File

@@ -0,0 +1,26 @@
#from distutils.core import setup
from distutils.extension import Extension
#from Cython.Distutils import build_ext

from distutils.core import setup
from Cython.Build import cythonize

#setup(ext_modules=cythonize("script.pyx"))

extensions = [Extension("script",
sources=["script.pyx", "src/essai.cpp"],
include_dirs=["include","include/lsape", "include/Eigen", "include/nomad", "include/sgtelib", "include/libsvm.3.22", "include/fann", "include/boost_1_69_0"],
library_dirs=["lib/fann","lib/gedlib", "lib/libsvm.3.22","lib/nomad"],
libraries=["doublefann","sgtelib", "svm", "nomad"],
language="c++",
extra_compile_args=["-std=c++11"],
extra_link_args=["-std=c++11"])]

setup(ext_modules=cythonize(extensions))

#extensions = [Extension("script", sources=["script.pyx", "include/gedlib-master/src/env/ged_env.ipp"], include_dirs=["."], language="c++")]
#setup(name = "script", ext_modules = extensions, cmdclass = {'build_ext':build_ext},)


# Bash command: python setup.py build_ext --inplace

+ 57
- 0
preimage/test.py View File

@@ -0,0 +1,57 @@
#export LD_LIBRARY_PATH=.:/export/home/lambertn/Documents/Cython_GedLib_2/lib/fann/:/export/home/lambertn/Documents/Cython_GedLib_2/lib/libsvm.3.22:/export/home/lambertn/Documents/Cython_GedLib_2/lib/nomad

#So that "import script" finds the libraries GedLib needs
#Equivalent to setting the LD_LIBRARY_PATH environment variable in a bash shell
#Allows running from IDLE and the like without setting the environment variable every time
#os.environ does not work in this case
import librariesImport, script

#import script

#truc = script.computeEditDistanceOnGXlGraphs('include/gedlib-master/data/datasets/Mutagenicity/data/','collections/MUTA_10.xml',"CHEM_1", "BIPARTITE", "")
#print(truc)
#script.PyRestartEnv()
#script.appel()

def test() :
# script.appel()
script.PyRestartEnv()
# print("Here is the Python function !")
#
# print("List of Edit Cost Options : ")
# for i in script.listOfEditCostOptions :
# print (i)
# print("")
#
# print("List of Method Options : ")
# for j in script.listOfMethodOptions :
# print (j)
# print("")
script.PyLoadGXLGraph('include/gedlib-master/data/datasets/Mutagenicity/data/', 'collections/MUTA_10.xml')
listID = script.PyGetGraphIds()
afficheId = ""
for i in listID :
afficheId+=str(i) + " "
print("Number of graphs = " + str(len(listID)) + ", list of Ids = " + afficheId)

script.PySetEditCost("CHEM_1")

script.PyInitEnv()

script.PySetMethod("BIPARTITE", "")
script.PyInitMethod()

g = listID[0]
h = listID[1]

script.PyRunMethod(g,h)
liste = script.PyGetAllMap(g,h)
print("Forward map : " ,liste[0], ", Backward map : ", liste[1])
print ("Upper Bound = " + str(script.PyGetUpperBound(g,h)) + ", Lower Bound = " + str(script.PyGetLowerBound(g,h)) + ", Runtime = " + str(script.PyGetRuntime(g,h)))


test()

+ 3
- 3
pygraph/utils/graphdataset.py View File

@@ -52,10 +52,10 @@ def get_dataset_attributes(Gn,
return False if edge_label is None else True

def get_edge_label_num(Gn):
nl = set()
el = set()
for G in Gn:
nl = nl | set(nx.get_edge_attributes(G, edge_label).values())
return len(nl)
el = el | set(nx.get_edge_attributes(G, edge_label).values())
return len(el)

def is_directed(Gn):
return nx.is_directed(Gn[0])


+ 89
- 39
pygraph/utils/graphfiles.py View File

@@ -22,8 +22,8 @@ def loadCT(filename):
with open(filename) as f:
content = f.read().splitlines()
g = nx.Graph(
name=str(content[0]),
filename=basename(filename)) # set name of the graph
name = str(content[0]),
filename = basename(filename)) # set name of the graph
tmp = content[1].split(" ")
if tmp[0] == '':
nb_nodes = int(tmp[1]) # number of the nodes
@@ -84,43 +84,63 @@ def loadGXL(filename):
return g


def saveGXL(graph, filename):
import xml.etree.ElementTree as ET
root_node = ET.Element('gxl')
attr = dict()
attr['id'] = graph.graph['name']
attr['edgeids'] = 'true'
attr['edgemode'] = 'undirected'
graph_node = ET.SubElement(root_node, 'graph', attrib=attr)

for v in graph:
current_node = ET.SubElement(graph_node, 'node', attrib={'id': str(v)})
for attr in graph.nodes[v].keys():
cur_attr = ET.SubElement(
current_node, 'attr', attrib={'name': attr})
cur_value = ET.SubElement(cur_attr,
graph.nodes[v][attr].__class__.__name__)
cur_value.text = graph.nodes[v][attr]

for v1 in graph:
for v2 in graph[v1]:
if (v1 < v2): # Non oriented graphs
cur_edge = ET.SubElement(
graph_node,
'edge',
attrib={
'from': str(v1),
'to': str(v2)
})
for attr in graph[v1][v2].keys():
cur_attr = ET.SubElement(
cur_edge, 'attr', attrib={'name': attr})
cur_value = ET.SubElement(
cur_attr, graph[v1][v2][attr].__class__.__name__)
cur_value.text = str(graph[v1][v2][attr])

tree = ET.ElementTree(root_node)
tree.write(filename)
def saveGXL(graph, filename, method='benoit'):
if method == 'benoit':
import xml.etree.ElementTree as ET
root_node = ET.Element('gxl')
attr = dict()
attr['id'] = str(graph.graph['name'])
attr['edgeids'] = 'true'
attr['edgemode'] = 'undirected'
graph_node = ET.SubElement(root_node, 'graph', attrib=attr)
for v in graph:
current_node = ET.SubElement(graph_node, 'node', attrib={'id': str(v)})
for attr in graph.nodes[v].keys():
cur_attr = ET.SubElement(
current_node, 'attr', attrib={'name': attr})
cur_value = ET.SubElement(cur_attr,
graph.nodes[v][attr].__class__.__name__)
cur_value.text = str(graph.nodes[v][attr])
for v1 in graph:
for v2 in graph[v1]:
if (v1 < v2): # Non oriented graphs
cur_edge = ET.SubElement(
graph_node,
'edge',
attrib={
'from': str(v1),
'to': str(v2)
})
for attr in graph[v1][v2].keys():
cur_attr = ET.SubElement(
cur_edge, 'attr', attrib={'name': attr})
cur_value = ET.SubElement(
cur_attr, graph[v1][v2][attr].__class__.__name__)
cur_value.text = str(graph[v1][v2][attr])
tree = ET.ElementTree(root_node)
tree.write(filename)
elif method == 'gedlib':
# reference: https://github.com/dbblumenthal/gedlib/blob/master/data/generate_molecules.py#L22
pass
# gxl_file = open(filename, 'w')
# gxl_file.write("<?xml version=\"1.0\"?>\n")
# gxl_file.write("<!DOCTYPE gxl SYSTEM \"http://www.gupro.de/GXL/gxl-1.0.dtd\">\n")
# gxl_file.write("<gxl>\n")
# gxl_file.write("<graph id=\"" + str(graph.graph['name']) + "\" edgeids=\"false\" edgemode=\"undirected\">\n")
# for v in graph:
# gxl_file.write("<node id=\"_" + str(v) + "\">\n")
# gxl_file.write("<attr name=\"chem\"><int>" + str(self.node_labels[node]) + "</int></attr>\n")
# gxl_file.write("</node>\n")
# for edge in self.edge_list:
# gxl_file.write("<edge from=\"_" + str(edge[0]) + "\" to=\"_" + str(edge[1]) + "\">\n")
# gxl_file.write("<attr name=\"valence\"><int>1</int></attr>\n")
# gxl_file.write("</edge>\n")
# gxl_file.write("</graph>\n")
# gxl_file.write("</gxl>\n")
# gxl_file.close()


def loadSDF(filename):
@@ -412,3 +432,33 @@ def loadDataset(filename, filename_y=None, extra_params=None):
# print(g.edges(data=True))

return data, y


def saveDataset(Gn, y, gformat='gxl', group=None, filename='gfile'):
"""Save list of graphs.
"""
import os
dirname_ds = os.path.dirname(filename)
if dirname_ds != '':
dirname_ds += '/'
if not os.path.exists(dirname_ds) :
os.makedirs(dirname_ds)
if group == 'xml' and gformat == 'gxl':
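# Write a gedlib-style collection file: one <graph file="..." class="..."/>
# entry per GXL graph saved next to it.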
with open(filename + '.xml', 'w') as fgroup:
fgroup.write("<?xml version=\"1.0\"?>")
fgroup.write("\n<!DOCTYPE GraphCollection SYSTEM \"https://dbblumenthal.github.io/gedlib/GraphCollection_8dtd_source.html\">")
fgroup.write("\n<GraphCollection>")
for idx, g in enumerate(Gn):
fname_tmp = "graph" + str(idx) + ".gxl"
saveGXL(g, dirname_ds + fname_tmp)
fgroup.write("\n\t<graph file=\"" + fname_tmp + "\" class=\"" + str(y[idx]) + "\"/>")
fgroup.write("\n</GraphCollection>")


if __name__ == '__main__':
ds = {'name': 'MUTAG', 'dataset': '../../datasets/MUTAG/MUTAG.mat',
'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}} # node/edge symb
Gn, y = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
saveDataset(Gn, y, group='xml', filename='temp/temp')

+ 58
- 98
pygraph/utils/model_selection_precomputed.py View File

@@ -420,55 +420,6 @@ def model_selection_for_precomputed_kernel(datafile,
# np.save(results_name_pre + 'best_gram_matrix_time.dt',
# best_gram_matrix_time)
# print out as table.
from collections import OrderedDict
from tabulate import tabulate
table_dict = {}
if model_type == 'regression':
for param_in in param_list:
param_in['alpha'] = '{:.2e}'.format(param_in['alpha'])
else:
for param_in in param_list:
param_in['C'] = '{:.2e}'.format(param_in['C'])
table_dict['params'] = [{**param_out, **param_in}
for param_in in param_list for param_out in param_list_pre_revised]
table_dict['gram_matrix_time'] = [
'{:.2f}'.format(gram_matrix_time[index_out])
for param_in in param_list
for index_out, _ in enumerate(param_list_pre_revised)
]
table_dict['valid_perf'] = [
'{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in],
std_val_scores[index_out][index_in])
for index_in, _ in enumerate(param_list)
for index_out, _ in enumerate(param_list_pre_revised)
]
table_dict['test_perf'] = [
'{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in],
std_perf_scores[index_out][index_in])
for index_in, _ in enumerate(param_list)
for index_out, _ in enumerate(param_list_pre_revised)
]
table_dict['train_perf'] = [
'{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in],
std_train_scores[index_out][index_in])
for index_in, _ in enumerate(param_list)
for index_out, _ in enumerate(param_list_pre_revised)
]
keyorder = [
'params', 'train_perf', 'valid_perf', 'test_perf',
'gram_matrix_time'
]
if verbose:
print()
tb_print = tabulate(
OrderedDict(
sorted(table_dict.items(),
key=lambda i: keyorder.index(i[0]))),
headers='keys')
# print(tb_print)
str_fw += 'table of performance v.s. hyper-params:\n\n%s\n\n' % tb_print
# read gram matrices from file.
else:
# Grid of parameters with a discrete number of values for each.
@@ -632,58 +583,16 @@ def model_selection_for_precomputed_kernel(datafile,
# str_fw += 'time to calculate best gram matrix: {:.2f}±{:.2f}s\n'.format(ave_bgmt, std_bgmt)
str_fw += 'training time with hyper-param choices who did not participate in calculation of gram matrices: {:.2f}s\n\n'.format(tt_poster)

# print out as table.
from collections import OrderedDict
from tabulate import tabulate
table_dict = {}
if model_type == 'regression':
for param_in in param_list:
param_in['alpha'] = '{:.2e}'.format(param_in['alpha'])
else:
for param_in in param_list:
param_in['C'] = '{:.2e}'.format(param_in['C'])
table_dict['params'] = [{**param_out, **param_in}
for param_in in param_list for param_out in param_list_pre_revised]
# table_dict['gram_matrix_time'] = [
# '{:.2f}'.format(gram_matrix_time[index_out])
# for param_in in param_list
# for index_out, _ in enumerate(param_list_pre_revised)
# ]
table_dict['valid_perf'] = [
'{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in],
std_val_scores[index_out][index_in])
for index_in, _ in enumerate(param_list)
for index_out, _ in enumerate(param_list_pre_revised)
]
table_dict['test_perf'] = [
'{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in],
std_perf_scores[index_out][index_in])
for index_in, _ in enumerate(param_list)
for index_out, _ in enumerate(param_list_pre_revised)
]
table_dict['train_perf'] = [
'{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in],
std_train_scores[index_out][index_in])
for index_in, _ in enumerate(param_list)
for index_out, _ in enumerate(param_list_pre_revised)
]
keyorder = [
'params', 'train_perf', 'valid_perf', 'test_perf'
]
if verbose:
print()
tb_print = tabulate(
OrderedDict(
sorted(table_dict.items(),
key=lambda i: keyorder.index(i[0]))),
headers='keys')
# print(tb_print)
str_fw += 'table of performance v.s. hyper-params:\n\n%s\n\n' % tb_print

# open file to save all results for this dataset.
if not os.path.exists(results_dir):
os.makedirs(results_dir)
# print out results as table.
str_fw += printResultsInTable(param_list, param_list_pre_revised, average_val_scores,
std_val_scores, average_perf_scores, std_perf_scores,
average_train_scores, std_train_scores, gram_matrix_time,
model_type, verbose)
# open file to save all results for this dataset.
if not os.path.exists(results_dir + '/' + ds_name + '.output.txt'):
with open(results_dir + '/' + ds_name + '.output.txt', 'w') as f:
@@ -974,4 +883,55 @@ def read_gram_matrices_from_file(results_dir, ds_name):
gram_matrices = gmfile['gms'] # a list to store gram matrices for all param_grid_precomputed
param_list_pre_revised = gmfile['params'] # list to store param grids precomputed ignoring the useless ones
y = gmfile['y'].tolist()
return gram_matrices, param_list_pre_revised, y
return gram_matrices, param_list_pre_revised, y


def printResultsInTable(param_list, param_list_pre_revised, average_val_scores,
std_val_scores, average_perf_scores, std_perf_scores,
average_train_scores, std_train_scores, gram_matrix_time,
model_type, verbose):
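"""Format cross-validation results as a text table, one row per
combination of pre-computed and post-computed hyper-parameters, and
return it as a string."""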
from collections import OrderedDict
from tabulate import tabulate
table_dict = {}
if model_type == 'regression':
for param_in in param_list:
param_in['alpha'] = '{:.2e}'.format(param_in['alpha'])
else:
for param_in in param_list:
param_in['C'] = '{:.2e}'.format(param_in['C'])
table_dict['params'] = [{**param_out, **param_in}
for param_in in param_list for param_out in param_list_pre_revised]
table_dict['gram_matrix_time'] = [
'{:.2f}'.format(gram_matrix_time[index_out])
for param_in in param_list
for index_out, _ in enumerate(param_list_pre_revised)
]
table_dict['valid_perf'] = [
'{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in],
std_val_scores[index_out][index_in])
for index_in, _ in enumerate(param_list)
for index_out, _ in enumerate(param_list_pre_revised)
]
table_dict['test_perf'] = [
'{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in],
std_perf_scores[index_out][index_in])
for index_in, _ in enumerate(param_list)
for index_out, _ in enumerate(param_list_pre_revised)
]
table_dict['train_perf'] = [
'{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in],
std_train_scores[index_out][index_in])
for index_in, _ in enumerate(param_list)
for index_out, _ in enumerate(param_list_pre_revised)
]
keyorder = [
'params', 'train_perf', 'valid_perf', 'test_perf',
'gram_matrix_time'
]
if verbose:
print()
tb_print = tabulate(OrderedDict(sorted(table_dict.items(),
key=lambda i: keyorder.index(i[0]))), headers='keys')
# print(tb_print)
return 'table of performance vs. hyper-params:\n\n%s\n\n' % tb_print
