Browse Source

Merge pull request #13 from jajupmochi/v0.2

Update function gklearn.utils.graph_file.load_gxl()
tags/v0.2.0
linlin GitHub 5 years ago
parent
commit
84ebbcd2d4
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 606 additions and 36 deletions
  1. +0
    -2
      .travis.yml
  2. +549
    -0
      gklearn/preimage/experiments/xp_median_preimage.py
  3. +4
    -1
      gklearn/utils/dataset.py
  4. +53
    -33
      gklearn/utils/graph_files.py

+ 0
- 2
.travis.yml View File

@@ -1,7 +1,5 @@
language: python language: python
python: python:
- '3'
- '3.4'
- '3.5' - '3.5'
- '3.6' - '3.6'
- '3.7' - '3.7'


+ 549
- 0
gklearn/preimage/experiments/xp_median_preimage.py View File

@@ -0,0 +1,549 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jan 14 15:39:29 2020

@author: ljia
"""
import multiprocessing
import functools
from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct
from gklearn.preimage.utils import generate_median_preimages_by_class
from gklearn.utils import compute_gram_matrices_by_class


def xp_median_preimage_8_1():
    """xp 8_1: Monoterpenoides, sspkernel, using CONSTANT.

    Runs the median-preimage experiment on Monoterpenoides with the
    structural shortest-path kernel and the CONSTANT GED edit cost,
    once per fit method ('random' is repeated on purpose).
    """
    # Experiment configuration.
    ds_name = 'Monoterpenoides'
    n_jobs = multiprocessing.cpu_count()

    # Median preimage generator options.
    mpg_options = dict(
        fit_method='k-graphs',
        init_ecc=[3, 3, 1, 3, 3, 1],
        ds_name=ds_name,
        parallel=True,  # False
        time_limit_in_sec=0,
        max_itrs=100,
        max_itrs_without_update=3,
        epsilon_residual=0.01,
        epsilon_ec=0.1,
        verbose=2,
    )

    # Kernel options: structural sp kernel with mixed node/edge sub-kernels.
    mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
    sub_kernels = dict(symb=deltakernel, nsymb=gaussiankernel, mix=mixkernel)
    kernel_options = dict(
        name='structuralspkernel',
        edge_weight=None,
        node_kernels=sub_kernels,
        edge_kernels=sub_kernels,
        compute_method='naive',
        parallel='imap_unordered',  # or None
        n_jobs=n_jobs,
        normalize=True,
        verbose=2,
    )

    # Graph edit distance options.
    ged_options = dict(
        method='IPFP',
        initialization_method='RANDOM',  # 'NODE'
        initial_solutions=10,  # 1
        edit_cost='CONSTANT',
        attr_distance='euclidean',
        ratio_runs_from_initial_solutions=1,
        threads=n_jobs,
        init_option='EAGER_WITHOUT_SHUFFLED_COPIES',
    )

    # Median graph estimator options.
    mge_options = dict(
        init_type='MEDOID',
        random_inits=10,
        time_limit=600,
        verbose=2,
        refine=False,
    )

    save_results = True
    dir_save = '../results/xp_median_preimage/'
    irrelevant_labels = None
    edge_required = False

    # Echo settings before running.
    print('parameters:')
    print('dataset name:', ds_name)
    print('mpg_options:', mpg_options)
    print('kernel_options:', kernel_options)
    print('ged_options:', ged_options)
    print('mge_options:', mge_options)
    print('save_results:', save_results)
    print('irrelevant_labels:', irrelevant_labels)
    print()

    # Generate preimages once per fit method.
    for fit_method in ['k-graphs', 'expert', 'random', 'random', 'random']:
        print('\n-------------------------------------')
        print('fit method:', fit_method, '\n')
        mpg_options['fit_method'] = fit_method
        generate_median_preimages_by_class(
            ds_name, mpg_options, kernel_options, ged_options, mge_options,
            save_results=save_results, save_medians=True, plot_medians=True,
            load_gm='auto', dir_save=dir_save,
            irrelevant_labels=irrelevant_labels, edge_required=edge_required)

def xp_median_preimage_7_1():
    """xp 7_1: MUTAG, sspkernel, using CONSTANT.

    Median-preimage experiment on MUTAG with the structural shortest-path
    kernel and the CONSTANT GED edit cost, once per fit method
    ('random' is repeated on purpose).
    """
    ds_name = 'MUTAG'
    n_jobs = multiprocessing.cpu_count()

    # Median preimage generator options.
    mpg_options = dict(
        fit_method='k-graphs',
        init_ecc=[4, 4, 2, 1, 1, 1],
        ds_name=ds_name,
        parallel=True,  # False
        time_limit_in_sec=0,
        max_itrs=100,
        max_itrs_without_update=3,
        epsilon_residual=0.01,
        epsilon_ec=0.1,
        verbose=2,
    )

    # Kernel options: structural sp kernel with mixed node/edge sub-kernels.
    mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
    sub_kernels = dict(symb=deltakernel, nsymb=gaussiankernel, mix=mixkernel)
    kernel_options = dict(
        name='structuralspkernel',
        edge_weight=None,
        node_kernels=sub_kernels,
        edge_kernels=sub_kernels,
        compute_method='naive',
        parallel='imap_unordered',  # or None
        n_jobs=n_jobs,
        normalize=True,
        verbose=2,
    )

    # Graph edit distance options.
    ged_options = dict(
        method='IPFP',
        initialization_method='RANDOM',  # 'NODE'
        initial_solutions=10,  # 1
        edit_cost='CONSTANT',
        attr_distance='euclidean',
        ratio_runs_from_initial_solutions=1,
        threads=n_jobs,
        init_option='EAGER_WITHOUT_SHUFFLED_COPIES',
    )

    # Median graph estimator options.
    mge_options = dict(
        init_type='MEDOID',
        random_inits=10,
        time_limit=600,
        verbose=2,
        refine=False,
    )

    save_results = True
    dir_save = '../results/xp_median_preimage/'
    irrelevant_labels = None
    edge_required = False

    # Echo settings before running.
    print('parameters:')
    print('dataset name:', ds_name)
    print('mpg_options:', mpg_options)
    print('kernel_options:', kernel_options)
    print('ged_options:', ged_options)
    print('mge_options:', mge_options)
    print('save_results:', save_results)
    print('irrelevant_labels:', irrelevant_labels)
    print()

    # Generate preimages once per fit method.
    for fit_method in ['k-graphs', 'expert', 'random', 'random', 'random']:
        print('\n-------------------------------------')
        print('fit method:', fit_method, '\n')
        mpg_options['fit_method'] = fit_method
        generate_median_preimages_by_class(
            ds_name, mpg_options, kernel_options, ged_options, mge_options,
            save_results=save_results, save_medians=True, plot_medians=True,
            load_gm='auto', dir_save=dir_save,
            irrelevant_labels=irrelevant_labels, edge_required=edge_required)


def xp_median_preimage_6_1():
    """xp 6_1: COIL-RAG, sspkernel, using NON_SYMBOLIC.

    Median-preimage experiment on COIL-RAG with the structural
    shortest-path kernel and the NON_SYMBOLIC GED edit cost, once per
    fit method ('random' is repeated on purpose).
    """
    ds_name = 'COIL-RAG'
    n_jobs = multiprocessing.cpu_count()

    # Median preimage generator options.
    mpg_options = dict(
        fit_method='k-graphs',
        init_ecc=[3, 3, 1, 3, 3, 1],
        ds_name=ds_name,
        parallel=True,  # False
        time_limit_in_sec=0,
        max_itrs=100,
        max_itrs_without_update=3,
        epsilon_residual=0.01,
        epsilon_ec=0.1,
        verbose=2,
    )

    # Kernel options: structural sp kernel with mixed node/edge sub-kernels.
    mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
    sub_kernels = dict(symb=deltakernel, nsymb=gaussiankernel, mix=mixkernel)
    kernel_options = dict(
        name='structuralspkernel',
        edge_weight=None,
        node_kernels=sub_kernels,
        edge_kernels=sub_kernels,
        compute_method='naive',
        parallel='imap_unordered',  # or None
        n_jobs=n_jobs,
        normalize=True,
        verbose=2,
    )

    # Graph edit distance options.
    ged_options = dict(
        method='IPFP',
        initialization_method='RANDOM',  # 'NODE'
        initial_solutions=10,  # 1
        edit_cost='NON_SYMBOLIC',
        attr_distance='euclidean',
        ratio_runs_from_initial_solutions=1,
        threads=n_jobs,
        init_option='EAGER_WITHOUT_SHUFFLED_COPIES',
    )

    # Median graph estimator options.
    mge_options = dict(
        init_type='MEDOID',
        random_inits=10,
        time_limit=600,
        verbose=2,
        refine=False,
    )

    save_results = True
    dir_save = '../results/xp_median_preimage/'
    irrelevant_labels = None
    edge_required = False

    # Echo settings before running.
    print('parameters:')
    print('dataset name:', ds_name)
    print('mpg_options:', mpg_options)
    print('kernel_options:', kernel_options)
    print('ged_options:', ged_options)
    print('mge_options:', mge_options)
    print('save_results:', save_results)
    print('irrelevant_labels:', irrelevant_labels)
    print()

    # Generate preimages once per fit method (no 'expert' costs for this dataset).
    for fit_method in ['k-graphs', 'random', 'random', 'random']:
        print('\n-------------------------------------')
        print('fit method:', fit_method, '\n')
        mpg_options['fit_method'] = fit_method
        generate_median_preimages_by_class(
            ds_name, mpg_options, kernel_options, ged_options, mge_options,
            save_results=save_results, save_medians=True, plot_medians=True,
            load_gm='auto', dir_save=dir_save,
            irrelevant_labels=irrelevant_labels, edge_required=edge_required)


def xp_median_preimage_5_1():
    """xp 5_1: FRANKENSTEIN, sspkernel, using NON_SYMBOLIC.

    Median-preimage experiment on FRANKENSTEIN with the structural
    shortest-path kernel and the NON_SYMBOLIC GED edit cost, once per
    fit method ('random' is repeated on purpose).
    """
    ds_name = 'FRANKENSTEIN'
    n_jobs = multiprocessing.cpu_count()

    # Median preimage generator options.
    mpg_options = dict(
        fit_method='k-graphs',
        init_ecc=[3, 3, 1, 3, 3, 0],
        ds_name=ds_name,
        parallel=True,  # False
        time_limit_in_sec=0,
        max_itrs=100,
        max_itrs_without_update=3,
        epsilon_residual=0.01,
        epsilon_ec=0.1,
        verbose=2,
    )

    # Kernel options: structural sp kernel with mixed node/edge sub-kernels.
    mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
    sub_kernels = dict(symb=deltakernel, nsymb=gaussiankernel, mix=mixkernel)
    kernel_options = dict(
        name='structuralspkernel',
        edge_weight=None,
        node_kernels=sub_kernels,
        edge_kernels=sub_kernels,
        compute_method='naive',
        parallel='imap_unordered',  # or None
        n_jobs=n_jobs,
        normalize=True,
        verbose=2,
    )

    # Graph edit distance options.
    ged_options = dict(
        method='IPFP',
        initialization_method='RANDOM',  # 'NODE'
        initial_solutions=10,  # 1
        edit_cost='NON_SYMBOLIC',
        attr_distance='euclidean',
        ratio_runs_from_initial_solutions=1,
        threads=n_jobs,
        init_option='EAGER_WITHOUT_SHUFFLED_COPIES',
    )

    # Median graph estimator options.
    mge_options = dict(
        init_type='MEDOID',
        random_inits=10,
        time_limit=600,
        verbose=2,
        refine=False,
    )

    save_results = True
    dir_save = '../results/xp_median_preimage/'
    irrelevant_labels = None
    edge_required = False

    # Echo settings before running.
    print('parameters:')
    print('dataset name:', ds_name)
    print('mpg_options:', mpg_options)
    print('kernel_options:', kernel_options)
    print('ged_options:', ged_options)
    print('mge_options:', mge_options)
    print('save_results:', save_results)
    print('irrelevant_labels:', irrelevant_labels)
    print()

    # Generate preimages once per fit method (no 'expert' costs for this dataset).
    for fit_method in ['k-graphs', 'random', 'random', 'random']:
        print('\n-------------------------------------')
        print('fit method:', fit_method, '\n')
        mpg_options['fit_method'] = fit_method
        generate_median_preimages_by_class(
            ds_name, mpg_options, kernel_options, ged_options, mge_options,
            save_results=save_results, save_medians=True, plot_medians=True,
            load_gm='auto', dir_save=dir_save,
            irrelevant_labels=irrelevant_labels, edge_required=edge_required)

def xp_median_preimage_4_1():
    """xp 4_1: COLORS-3, sspkernel, using NON_SYMBOLIC.

    Median-preimage experiment on COLORS-3 with the structural
    shortest-path kernel and the NON_SYMBOLIC GED edit cost, once per
    fit method ('random' is repeated on purpose).
    """
    ds_name = 'COLORS-3'
    n_jobs = multiprocessing.cpu_count()

    # Median preimage generator options.
    mpg_options = dict(
        fit_method='k-graphs',
        init_ecc=[3, 3, 1, 3, 3, 0],
        ds_name=ds_name,
        parallel=True,  # False
        time_limit_in_sec=0,
        max_itrs=100,
        max_itrs_without_update=3,
        epsilon_residual=0.01,
        epsilon_ec=0.1,
        verbose=2,
    )

    # Kernel options: structural sp kernel with mixed node/edge sub-kernels.
    mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
    sub_kernels = dict(symb=deltakernel, nsymb=gaussiankernel, mix=mixkernel)
    kernel_options = dict(
        name='structuralspkernel',
        edge_weight=None,
        node_kernels=sub_kernels,
        edge_kernels=sub_kernels,
        compute_method='naive',
        parallel='imap_unordered',  # or None
        n_jobs=n_jobs,
        normalize=True,
        verbose=2,
    )

    # Graph edit distance options.
    ged_options = dict(
        method='IPFP',
        initialization_method='RANDOM',  # 'NODE'
        initial_solutions=10,  # 1
        edit_cost='NON_SYMBOLIC',
        attr_distance='euclidean',
        ratio_runs_from_initial_solutions=1,
        threads=n_jobs,
        init_option='EAGER_WITHOUT_SHUFFLED_COPIES',
    )

    # Median graph estimator options.
    mge_options = dict(
        init_type='MEDOID',
        random_inits=10,
        time_limit=600,
        verbose=2,
        refine=False,
    )

    save_results = True
    dir_save = '../results/xp_median_preimage/'
    irrelevant_labels = None
    edge_required = False

    # Echo settings before running.
    print('parameters:')
    print('dataset name:', ds_name)
    print('mpg_options:', mpg_options)
    print('kernel_options:', kernel_options)
    print('ged_options:', ged_options)
    print('mge_options:', mge_options)
    print('save_results:', save_results)
    print('irrelevant_labels:', irrelevant_labels)
    print()

    # Generate preimages once per fit method (no 'expert' costs for this dataset).
    for fit_method in ['k-graphs', 'random', 'random', 'random']:
        print('\n-------------------------------------')
        print('fit method:', fit_method, '\n')
        mpg_options['fit_method'] = fit_method
        generate_median_preimages_by_class(
            ds_name, mpg_options, kernel_options, ged_options, mge_options,
            save_results=save_results, save_medians=True, plot_medians=True,
            load_gm='auto', dir_save=dir_save,
            irrelevant_labels=irrelevant_labels, edge_required=edge_required)


def xp_median_preimage_3_1():
    """xp 3_1: Fingerprint, sspkernel, using LETTER2, only node attrs.

    Median-preimage experiment on Fingerprint with the structural
    shortest-path kernel and the LETTER2 GED edit cost; edge attributes
    are dropped so only node attributes are used.
    """
    ds_name = 'Fingerprint'
    n_jobs = multiprocessing.cpu_count()

    # Median preimage generator options.
    mpg_options = dict(
        fit_method='k-graphs',
        init_ecc=[0.525, 0.525, 0.001, 0.125, 0.125],
        ds_name=ds_name,
        parallel=True,  # False
        time_limit_in_sec=0,
        max_itrs=100,
        max_itrs_without_update=3,
        epsilon_residual=0.01,
        epsilon_ec=0.1,
        verbose=2,
    )

    # Kernel options: structural sp kernel with mixed node/edge sub-kernels.
    mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
    sub_kernels = dict(symb=deltakernel, nsymb=gaussiankernel, mix=mixkernel)
    kernel_options = dict(
        name='structuralspkernel',
        edge_weight=None,
        node_kernels=sub_kernels,
        edge_kernels=sub_kernels,
        compute_method='naive',
        parallel='imap_unordered',  # or None
        n_jobs=n_jobs,
        normalize=True,
        verbose=2,
    )

    # Graph edit distance options.
    ged_options = dict(
        method='IPFP',
        initialization_method='RANDOM',  # 'NODE'
        initial_solutions=10,  # 1
        edit_cost='LETTER2',
        attr_distance='euclidean',
        ratio_runs_from_initial_solutions=1,
        threads=n_jobs,
        init_option='EAGER_WITHOUT_SHUFFLED_COPIES',
    )

    # Median graph estimator options.
    mge_options = dict(
        init_type='MEDOID',
        random_inits=10,
        time_limit=600,
        verbose=2,
        refine=False,
    )

    save_results = True
    dir_save = '../results/xp_median_preimage/'
    # Strip edge attributes so only node attributes remain.
    irrelevant_labels = {'edge_attrs': ['orient', 'angle']}
    edge_required = False

    # Echo settings before running.
    print('parameters:')
    print('dataset name:', ds_name)
    print('mpg_options:', mpg_options)
    print('kernel_options:', kernel_options)
    print('ged_options:', ged_options)
    print('mge_options:', mge_options)
    print('save_results:', save_results)
    print('irrelevant_labels:', irrelevant_labels)
    print()

    # Generate preimages once per fit method ('random' is repeated on purpose).
    for fit_method in ['k-graphs', 'random', 'random', 'random']:
        print('\n-------------------------------------')
        print('fit method:', fit_method, '\n')
        mpg_options['fit_method'] = fit_method
        generate_median_preimages_by_class(
            ds_name, mpg_options, kernel_options, ged_options, mge_options,
            save_results=save_results, save_medians=True, plot_medians=True,
            load_gm='auto', dir_save=dir_save,
            irrelevant_labels=irrelevant_labels, edge_required=edge_required)

def xp_median_preimage_2_1():
    """xp 2_1: COIL-DEL, sspkernel, using LETTER2, only node attrs.

    Median-preimage experiment on COIL-DEL with the structural
    shortest-path kernel and the LETTER2 GED edit cost; edge labels
    are dropped so only node attributes are used.
    """
    ds_name = 'COIL-DEL'
    n_jobs = multiprocessing.cpu_count()

    # Median preimage generator options.
    mpg_options = dict(
        fit_method='k-graphs',
        init_ecc=[3, 3, 1, 3, 3],
        ds_name=ds_name,
        parallel=True,  # False
        time_limit_in_sec=0,
        max_itrs=100,
        max_itrs_without_update=3,
        epsilon_residual=0.01,
        epsilon_ec=0.1,
        verbose=2,
    )

    # Kernel options: structural sp kernel with mixed node/edge sub-kernels.
    mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
    sub_kernels = dict(symb=deltakernel, nsymb=gaussiankernel, mix=mixkernel)
    kernel_options = dict(
        name='structuralspkernel',
        edge_weight=None,
        node_kernels=sub_kernels,
        edge_kernels=sub_kernels,
        compute_method='naive',
        parallel='imap_unordered',  # or None
        n_jobs=n_jobs,
        normalize=True,
        verbose=2,
    )

    # Graph edit distance options.
    ged_options = dict(
        method='IPFP',
        initialization_method='RANDOM',  # 'NODE'
        initial_solutions=10,  # 1
        edit_cost='LETTER2',
        attr_distance='euclidean',
        ratio_runs_from_initial_solutions=1,
        threads=n_jobs,
        init_option='EAGER_WITHOUT_SHUFFLED_COPIES',
    )

    # Median graph estimator options.
    mge_options = dict(
        init_type='MEDOID',
        random_inits=10,
        time_limit=600,
        verbose=2,
        refine=False,
    )

    save_results = True
    dir_save = '../results/xp_median_preimage/'
    # Strip edge labels so only node attributes remain.
    irrelevant_labels = {'edge_labels': ['valence']}

    # Echo settings before running.
    print('parameters:')
    print('dataset name:', ds_name)
    print('mpg_options:', mpg_options)
    print('kernel_options:', kernel_options)
    print('ged_options:', ged_options)
    print('mge_options:', mge_options)
    print('save_results:', save_results)
    print('irrelevant_labels:', irrelevant_labels)
    print()

    # Optionally precompute gram matrices for each class a priori:
    # print('Compute gram matrices for each class a priori.')
    # compute_gram_matrices_by_class(ds_name, kernel_options, save_results=True, dir_save=dir_save, irrelevant_labels=irrelevant_labels)

    # Generate preimages once per fit method ('random' is repeated on purpose).
    for fit_method in ['k-graphs', 'random', 'random', 'random']:
        print('\n-------------------------------------')
        print('fit method:', fit_method, '\n')
        mpg_options['fit_method'] = fit_method
        generate_median_preimages_by_class(
            ds_name, mpg_options, kernel_options, ged_options, mge_options,
            save_results=save_results, save_medians=True, plot_medians=True,
            load_gm='auto', dir_save=dir_save,
            irrelevant_labels=irrelevant_labels)


def xp_median_preimage_1_1():
    """xp 1_1: Letter-high, sspkernel.

    Median-preimage experiment on Letter-high with the structural
    shortest-path kernel and the LETTER2 GED edit cost, once per fit
    method ('random' is repeated on purpose).
    """
    ds_name = 'Letter-high'
    n_jobs = multiprocessing.cpu_count()

    # Median preimage generator options.
    mpg_options = dict(
        fit_method='k-graphs',
        init_ecc=[3, 3, 1, 3, 3],
        ds_name=ds_name,
        parallel=True,  # False
        time_limit_in_sec=0,
        max_itrs=100,
        max_itrs_without_update=3,
        epsilon_residual=0.01,
        epsilon_ec=0.1,
        verbose=2,
    )

    # Kernel options: structural sp kernel with mixed node/edge sub-kernels.
    mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
    sub_kernels = dict(symb=deltakernel, nsymb=gaussiankernel, mix=mixkernel)
    kernel_options = dict(
        name='structuralspkernel',
        edge_weight=None,
        node_kernels=sub_kernels,
        edge_kernels=sub_kernels,
        compute_method='naive',
        parallel='imap_unordered',  # or None
        n_jobs=n_jobs,
        normalize=True,
        verbose=2,
    )

    # Graph edit distance options (single initial solution here).
    ged_options = dict(
        method='IPFP',
        initialization_method='RANDOM',  # 'NODE'
        initial_solutions=1,  # 1
        edit_cost='LETTER2',
        attr_distance='euclidean',
        ratio_runs_from_initial_solutions=1,
        threads=n_jobs,
        init_option='EAGER_WITHOUT_SHUFFLED_COPIES',
    )

    # Median graph estimator options.
    mge_options = dict(
        init_type='MEDOID',
        random_inits=10,
        time_limit=600,
        verbose=2,
        refine=False,
    )

    save_results = True

    # Echo settings before running.
    print('parameters:')
    print('dataset name:', ds_name)
    print('mpg_options:', mpg_options)
    print('kernel_options:', kernel_options)
    print('ged_options:', ged_options)
    print('mge_options:', mge_options)
    print('save_results:', save_results)

    # Generate preimages once per fit method.
    for fit_method in ['k-graphs', 'expert', 'random', 'random', 'random']:
        print('\n-------------------------------------')
        print('fit method:', fit_method, '\n')
        mpg_options['fit_method'] = fit_method
        generate_median_preimages_by_class(
            ds_name, mpg_options, kernel_options, ged_options, mge_options,
            save_results=save_results, save_medians=True, plot_medians=True,
            load_gm='auto', dir_save='../results/xp_median_preimage/')


if __name__ == "__main__":
#### xp 1_1: Letter-high, sspkernel.
# xp_median_preimage_1_1()
#### xp 2_1: COIL-DEL, sspkernel, using LETTER2, only node attrs.
# xp_median_preimage_2_1()
#### xp 3_1: Fingerprint, sspkernel, using LETTER2, only node attrs.
# xp_median_preimage_3_1()

#### xp 4_1: COLORS-3, sspkernel, using NON_SYMBOLIC.
# xp_median_preimage_4_1()
#### xp 5_1: FRANKENSTEIN, sspkernel, using NON_SYMBOLIC.
# xp_median_preimage_5_1()
#### xp 6_1: COIL-RAG, sspkernel, using NON_SYMBOLIC.
# xp_median_preimage_6_1()

#### xp 7_1: MUTAG, sspkernel, using CONSTANT.
# xp_median_preimage_7_1()
#### xp 8_1: Monoterpenoides, sspkernel, using CONSTANT.
xp_median_preimage_8_1()

+ 4
- 1
gklearn/utils/dataset.py View File

@@ -67,7 +67,7 @@ class Dataset(object):
def load_predefined_dataset(self, ds_name): def load_predefined_dataset(self, ds_name):
current_path = os.path.dirname(os.path.realpath(__file__)) + '/' current_path = os.path.dirname(os.path.realpath(__file__)) + '/'
if ds_name == 'acyclic':
if ds_name == 'Acyclic':
pass pass
elif ds_name == 'COIL-DEL': elif ds_name == 'COIL-DEL':
ds_file = current_path + '../../datasets/COIL-DEL/COIL-DEL_A.txt' ds_file = current_path + '../../datasets/COIL-DEL/COIL-DEL_A.txt'
@@ -93,6 +93,9 @@ class Dataset(object):
elif ds_name == 'Letter-med': # node non-symb elif ds_name == 'Letter-med': # node non-symb
ds_file = current_path + '../../datasets/Letter-high/Letter-med_A.txt' ds_file = current_path + '../../datasets/Letter-high/Letter-med_A.txt'
self.__graphs, self.__targets, label_names = load_dataset(ds_file) self.__graphs, self.__targets, label_names = load_dataset(ds_file)
elif ds_name == 'Monoterpenoides':
ds_file = current_path + '../../datasets/Monoterpenoides/dataset_10+.ds'
self.__graphs, self.__targets, label_names = load_dataset(ds_file)
elif ds_name == 'MUTAG': elif ds_name == 'MUTAG':
ds_file = current_path + '../../datasets/MUTAG/MUTAG_A.txt' ds_file = current_path + '../../datasets/MUTAG/MUTAG_A.txt'
self.__graphs, self.__targets, label_names = load_dataset(ds_file) self.__graphs, self.__targets, label_names = load_dataset(ds_file)


+ 53
- 33
gklearn/utils/graph_files.py View File

@@ -47,7 +47,7 @@ def load_dataset(filename, filename_targets=None, gformat=None, **kwargs):
""" """
extension = splitext(filename)[1][1:] extension = splitext(filename)[1][1:]
if extension == "ds": if extension == "ds":
data, y = loadFromDS(filename, filename_targets)
data, y, label_names = load_from_ds(filename, filename_targets)
elif extension == "cxl": elif extension == "cxl":
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
@@ -59,7 +59,7 @@ def load_dataset(filename, filename_targets=None, gformat=None, **kwargs):
for graph in root.iter('graph'): for graph in root.iter('graph'):
mol_filename = graph.attrib['file'] mol_filename = graph.attrib['file']
mol_class = graph.attrib['class'] mol_class = graph.attrib['class']
data.append(loadGXL(dirname_dataset + '/' + mol_filename))
data.append(load_gxl(dirname_dataset + '/' + mol_filename))
y.append(mol_class) y.append(mol_class)
elif extension == 'xml': elif extension == 'xml':
dir_dataset = kwargs.get('dirname_dataset', None) dir_dataset = kwargs.get('dirname_dataset', None)
@@ -127,7 +127,7 @@ def save_dataset(Gn, y, gformat='gxl', group=None, filename='gfile', xparams=Non
fgroup.close() fgroup.close()




def loadCT(filename):
def load_ct(filename):
"""load data from a Chemical Table (.ct) file. """load data from a Chemical Table (.ct) file.


Notes Notes
@@ -180,7 +180,7 @@ def loadCT(filename):
return g return g




def loadGXL(filename):
def load_gxl(filename): # @todo: directed graphs.
from os.path import basename from os.path import basename
import networkx as nx import networkx as nx
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
@@ -195,9 +195,6 @@ def loadGXL(filename):
labels = {} labels = {}
for attr in node.iter('attr'): for attr in node.iter('attr'):
labels[attr.attrib['name']] = attr[0].text labels[attr.attrib['name']] = attr[0].text
if 'chem' in labels:
labels['label'] = labels['chem']
labels['atom'] = labels['chem']
g.add_node(index, **labels) g.add_node(index, **labels)
index += 1 index += 1


@@ -205,11 +202,26 @@ def loadGXL(filename):
labels = {} labels = {}
for attr in edge.iter('attr'): for attr in edge.iter('attr'):
labels[attr.attrib['name']] = attr[0].text labels[attr.attrib['name']] = attr[0].text
if 'valence' in labels:
labels['label'] = labels['valence']
labels['bond_type'] = labels['valence']
g.add_edge(dic[edge.attrib['from']], dic[edge.attrib['to']], **labels) g.add_edge(dic[edge.attrib['from']], dic[edge.attrib['to']], **labels)
return g
# get label names.
label_names = {'node_labels': [], 'edge_labels': [], 'node_attrs': [], 'edge_attrs': []}
for node in root.iter('node'):
for attr in node.iter('attr'):
if attr[0].tag == 'int': # @todo: this maybe wrong, and slow.
label_names['node_labels'].append(attr.attrib['name'])
else:
label_names['node_attrs'].append(attr.attrib['name'])
break
for edge in root.iter('edge'):
for attr in edge.iter('attr'):
if attr[0].tag == 'int': # @todo: this maybe wrong, and slow.
label_names['edge_labels'].append(attr.attrib['name'])
else:
label_names['edge_attrs'].append(attr.attrib['name'])
break

return g, label_names




def saveGXL(graph, filename, method='default', node_labels=[], edge_labels=[], node_attrs=[], edge_attrs=[]): def saveGXL(graph, filename, method='default', node_labels=[], edge_labels=[], node_attrs=[], edge_attrs=[]):
@@ -649,43 +661,49 @@ def loadFromXML(filename, dir_dataset=None):
for graph in root.iter('graph'): for graph in root.iter('graph'):
mol_filename = graph.attrib['file'] mol_filename = graph.attrib['file']
mol_class = graph.attrib['class'] mol_class = graph.attrib['class']
data.append(loadGXL(dir_dataset + '/' + mol_filename))
data.append(load_gxl(dir_dataset + '/' + mol_filename))
y.append(mol_class) y.append(mol_class)
return data, y return data, y


def loadFromDS(filename, filename_y):
def load_from_ds(filename, filename_targets):
"""Load data from .ds file. """Load data from .ds file.


Possible graph formats include: Possible graph formats include:


'.ct': see function loadCT for detail.
'.ct': see function load_ct for detail.


'.gxl': see dunction loadGXL for detail.
'.gxl': see function load_gxl for detail.


Note these graph formats are checked automatically by the extensions of Note these graph formats are checked automatically by the extensions of
graph files. graph files.
""" """
def append_label_names(label_names, new_names):
for key, val in label_names.items():
label_names[key] += [name for name in new_names[key] if name not in val]
dirname_dataset = dirname(filename) dirname_dataset = dirname(filename)
data = [] data = []
y = [] y = []
label_names = {'node_labels': [], 'edge_labels': [], 'node_attrs': [], 'edge_attrs': []}
content = open(filename).read().splitlines() content = open(filename).read().splitlines()
extension = splitext(content[0].split(' ')[0])[1][1:] extension = splitext(content[0].split(' ')[0])[1][1:]
if filename_y is None or filename_y == '':
if filename_targets is None or filename_targets == '':
if extension == 'ct': if extension == 'ct':
for i in range(0, len(content)): for i in range(0, len(content)):
tmp = content[i].split(' ') tmp = content[i].split(' ')
# remove the '#'s in file names # remove the '#'s in file names
data.append( data.append(
loadCT(dirname_dataset + '/' + tmp[0].replace('#', '', 1)))
load_ct(dirname_dataset + '/' + tmp[0].replace('#', '', 1)))
y.append(float(tmp[1])) y.append(float(tmp[1]))
elif extension == 'gxl': elif extension == 'gxl':
for i in range(0, len(content)): for i in range(0, len(content)):
tmp = content[i].split(' ') tmp = content[i].split(' ')
# remove the '#'s in file names # remove the '#'s in file names
data.append(
loadGXL(dirname_dataset + '/' + tmp[0].replace('#', '', 1)))
g, l_names = load_gxl(dirname_dataset + '/' + tmp[0].replace('#', '', 1))
data.append(g)
append_label_names(label_names, l_names)
y.append(float(tmp[1])) y.append(float(tmp[1]))
else: # y in a separate file else: # y in a separate file
if extension == 'ct': if extension == 'ct':
@@ -693,22 +711,23 @@ def loadFromDS(filename, filename_y):
tmp = content[i] tmp = content[i]
# remove the '#'s in file names # remove the '#'s in file names
data.append( data.append(
loadCT(dirname_dataset + '/' + tmp.replace('#', '', 1)))
load_ct(dirname_dataset + '/' + tmp.replace('#', '', 1)))
elif extension == 'gxl': elif extension == 'gxl':
for i in range(0, len(content)): for i in range(0, len(content)):
tmp = content[i] tmp = content[i]
# remove the '#'s in file names # remove the '#'s in file names
data.append(
loadGXL(dirname_dataset + '/' + tmp.replace('#', '', 1)))
g, l_names = load_gxl(dirname_dataset + '/' + tmp.replace('#', '', 1))  # NOTE(review): in this branch tmp = content[i] is a whole line, so tmp[0] would take only its first character
data.append(g)
append_label_names(label_names, l_names)
content_y = open(filename_y).read().splitlines()
# assume entries in filename and filename_y have the same order.
content_y = open(filename_targets).read().splitlines()
# assume entries in filename and filename_targets have the same order.
for item in content_y: for item in content_y:
tmp = item.split(' ') tmp = item.split(' ')
# assume the 3rd entry in a line is y (for Alkane dataset) # assume the 3rd entry in a line is y (for Alkane dataset)
y.append(float(tmp[2])) y.append(float(tmp[2]))
return data, y
return data, y, label_names
if __name__ == '__main__': if __name__ == '__main__':
@@ -727,13 +746,14 @@ if __name__ == '__main__':
# print(Gn[1].edges(data=True)) # print(Gn[1].edges(data=True))
# print(y[1]) # print(y[1])
# # .gxl file.
# ds = {'name': 'monoterpenoides',
# 'dataset': '../../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
# Gn, y = loadDataset(ds['dataset'])
# print(Gn[1].nodes(data=True))
# print(Gn[1].edges(data=True))
# print(y[1])
# .gxl file.
ds = {'name': 'monoterpenoides',
'dataset': '../../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
Gn, y, label_names = load_dataset(ds['dataset'])
print(Gn[1].graph)
print(Gn[1].nodes(data=True))
print(Gn[1].edges(data=True))
print(y[1])
# ### Convert graph from one format to another. # ### Convert graph from one format to another.
# # .gxl file. # # .gxl file.
@@ -774,5 +794,5 @@ if __name__ == '__main__':
# dataset = '../../datasets/Letter-med/Letter-med_A.txt' # dataset = '../../datasets/Letter-med/Letter-med_A.txt'
# dataset = '../../datasets/AIDS/AIDS_A.txt' # dataset = '../../datasets/AIDS/AIDS_A.txt'
# dataset = '../../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt' # dataset = '../../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'
# Gn, targets = load_dataset(filename)
# Gn, targets, label_names = load_dataset(filename)
pass pass

Loading…
Cancel
Save