
1. Add a parallelization option to the treelet kernel, with the choices "imap_unordered" and None (serial).

2. Modify the treelet kernel to store canonkeys as tuples instead of concatenated strings, since some label strings contain more than one character. (A sketch of both changes follows below.)
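
Editor's note: the two changes can be illustrated with a short, self-contained sketch. This is a hypothetical reconstruction, not the actual treeletKernel.py code: the helper names (all_canonkeys, get_canonkeys, _worker) are invented for illustration, and networkx-style labelled graphs are assumed. parallel='imap_unordered' dispatches to multiprocessing.Pool.imap_unordered; parallel=None runs a plain serial loop.

    # Hypothetical sketch of the two changes above; names are illustrative,
    # not the real treeletKernel.py API.
    import multiprocessing
    from collections import Counter

    import networkx as nx


    def get_canonkeys(g):
        """Toy canonical keys: for each node, a tuple of its own label and
        the sorted labels of its neighbours. Tuples keep label boundaries
        intact, so multi-character labels cannot collide the way
        concatenated strings can: ('Cl', ('O',)) != ('C', ('lO',)),
        while 'Cl' + 'O' == 'C' + 'lO' == 'ClO'."""
        keys = Counter()
        for v, lb in g.nodes(data='label'):
            keys[(lb, tuple(sorted(g.nodes[u]['label'] for u in g[v])))] += 1
        return keys


    def _worker(item):
        # Carry the index along, since imap_unordered yields results in
        # completion order and they must be reordered afterwards.
        i, g = item
        return i, get_canonkeys(g)


    def all_canonkeys(graphs, parallel='imap_unordered', n_jobs=None, chunksize=100):
        """Compute canonkeys for every graph, in parallel or serially."""
        if parallel == 'imap_unordered':
            with multiprocessing.Pool(n_jobs) as pool:
                canonkeys = [None] * len(graphs)
                for i, keys in pool.imap_unordered(_worker, enumerate(graphs), chunksize):
                    canonkeys[i] = keys
        elif parallel is None:
            canonkeys = [get_canonkeys(g) for g in graphs]  # plain serial loop
        else:
            raise ValueError("parallel must be 'imap_unordered' or None")
        return canonkeys


    if __name__ == '__main__':
        g1 = nx.Graph()
        g1.add_nodes_from([(0, {'label': 'Cl'}), (1, {'label': 'O'})])
        g1.add_edge(0, 1)
        print(all_canonkeys([g1], parallel=None))

With concatenation, the label pairs ('Cl', 'O') and ('C', 'lO') both collapse to the string 'ClO'; the tuple form keeps them distinct, which is exactly the failure mode the commit message describes for labels longer than one character.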
v0.1
jajupmochi committed 6 years ago
commit b7eef40edf
17 changed files with 7545 additions and 4008 deletions
  1. +4 -0        .gitignore
  2. +22 -22      notebooks/run_commonwalkkernel.py
  3. +18 -20      notebooks/run_marginalizedkernel.py
  4. +38 -33      notebooks/run_randomwalkkernel.py
  5. +29 -29      notebooks/run_spkernel.py
  6. +35 -28      notebooks/run_structuralspkernel.py
  7. +17 -13      notebooks/run_treeletkernel.py
  8. +18 -20      notebooks/run_untilhpathkernel.py
  9. +18 -18      notebooks/run_weisfeilerlehmankernel.py
  10. +111 -114   notebooks/utils/get_dataset_attributes.ipynb
  11. +6644 -3316 notebooks/utils/plot_all_graphs.ipynb
  12. +238 -174   preimage/gk_iam.py
  13. +39 -28     preimage/iam.py
  14. +181 -117   preimage/preimage.py
  15. +111 -72    pygraph/kernels/treeletKernel.py
  16. +19 -1      pygraph/utils/graphfiles.py
  17. +3 -3       pygraph/utils/model_selection_precomputed.py

+4 -0  .gitignore

@@ -9,6 +9,10 @@ datasets/*
 !datasets/MUTAG/
 !datasets/Letter-med/
 !datasets/ENZYMES_txt/
+!datasets/DD/
+!datasets/NCI1/
+!datasets/NCI109/
+!datasets/AIDS/
 notebooks/results/*
 notebooks/check_gm/*
 notebooks/test_parallel/*


+22 -22  notebooks/run_commonwalkkernel.py

@@ -12,22 +12,25 @@ import multiprocessing
 from pygraph.kernels.commonWalkKernel import commonwalkkernel


 dslist = [
-    {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
-     'task': 'regression'}, # node symb
-    {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
-     'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt'},
-    # contains single node graph, node symb
-    {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds'}, # node/edge symb
-    {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds'}, # unlabeled
-    {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt'}, # node/edge symb
-    {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
-    # node nsymb
-    {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
-    # node symb/nsymb
+# {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
+#  'task': 'regression'}, # node symb
+# {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
+#  'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt'},
+# # contains single node graph, node symb
+# {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds'}, # node/edge symb
+# {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds'}, # unlabeled
+# {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt'}, # node/edge symb
+# {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
+# # node nsymb
+# {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
+# # node symb/nsymb
+# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1_A.txt'}, # node symb
+# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb
+    {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb
+# {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb
+#
 # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},
 # # node/edge symb
-# {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb

 # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb
 # # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb
 # # # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb
@@ -41,11 +44,6 @@ dslist = [

 # # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb
 # # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb
-# # {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb
-# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',
-#  'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
-# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',
-#  'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
 # {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',
 #  'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb


@@ -56,10 +54,12 @@ dslist = [
 # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},
 ]
 estimator = commonwalkkernel
+#param_grid_precomputed = [{'compute_method': ['geo'],
+#                           'weight': np.linspace(0.01, 0.15, 15)},
+##                          'weight': np.logspace(-1, -10, num=10, base=10)},
+#                          {'compute_method': ['exp'], 'weight': range(0, 15)}]
 param_grid_precomputed = [{'compute_method': ['geo'],
-                           'weight': np.linspace(0.01, 0.15, 15)},
-#                           'weight': np.logspace(-1, -10, num=10, base=10)},
-                          {'compute_method': ['exp'], 'weight': range(0, 15)}]
+                           'weight': np.linspace(0.01, 0.15, 15)}]
 param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},
               {'alpha': np.logspace(-10, 10, num=41, base=10)}]




+18 -20  notebooks/run_marginalizedkernel.py

@@ -12,22 +12,25 @@ import multiprocessing
 from pygraph.kernels.marginalizedKernel import marginalizedkernel


 dslist = [
-    {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
-     'task': 'regression'}, # node symb
-    {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
-     'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt'},
-    # contains single node graph, node symb
-    {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds'}, # node/edge symb
-    {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds'}, # unlabeled
-    {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt'}, # node/edge symb
-    {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
-    # node nsymb
-    {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
-    # node symb/nsymb
+# {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
+#  'task': 'regression'}, # node symb
+# {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
+#  'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt'},
+# # contains single node graph, node symb
+# {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds'}, # node/edge symb
+# {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds'}, # unlabeled
+# {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt'}, # node/edge symb
+# {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
+# # node nsymb
+# {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
+# # node symb/nsymb
+# {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb
+# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1_A.txt'}, # node symb
+# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb
+    {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb
+#
 # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},
 # # node/edge symb
-# {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb

 # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb
 # # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb
 # # # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb
@@ -41,11 +44,6 @@ dslist = [

 # # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb
 # # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb
-# # {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb
-# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',
-#  'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
-# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',
-#  'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
 # {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',
 #  'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb


@@ -59,7 +57,7 @@ estimator = marginalizedkernel
 #param_grid_precomputed = {'p_quit': np.linspace(0.1, 0.3, 3),
 #                          'n_iteration': np.linspace(1, 1, 1),
 param_grid_precomputed = {'p_quit': np.linspace(0.1, 0.9, 9),
-                          'n_iteration': np.linspace(1, 19, 7),
+                          'n_iteration': np.linspace(5, 20, 4),
                           'remove_totters': [False]}
 param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},
               {'alpha': np.logspace(-10, 10, num=41, base=10)}]


+38 -33  notebooks/run_randomwalkkernel.py

@@ -17,22 +17,25 @@ import numpy as np


 dslist = [
-    {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
-     'task': 'regression'}, # node symb
-    {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
-     'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt'},
-    # contains single node graph, node symb
-    {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds'}, # node/edge symb
-    {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds'}, # unlabeled
-    {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt'}, # node/edge symb
-    {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
-    # node nsymb
-    {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
-    # node symb/nsymb
+# {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
+#  'task': 'regression'}, # node symb
+# {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
+#  'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt'},
+# # contains single node graph, node symb
+# {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds'}, # node/edge symb
+# {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds'}, # unlabeled
+# {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt'}, # node/edge symb
+# {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
+# # node symb/nsymb
+# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1_A.txt'}, # node symb
+# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb
+    {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb
+# {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb
+# {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
+# # node nsymb
+#
 # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},
 # # node/edge symb
-# {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb

 # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb
 # # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb
 # # # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb
@@ -40,22 +43,17 @@ dslist = [
 #
 # # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb
 # # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb
-# {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb, missing values
-# {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb, missing values
+# # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb
+# # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb
 # # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb

 # # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb
 # # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb
-# # {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb
-# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',
-#  'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
-# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',
-#  'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
 # {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',
 #  'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb

-# # not working below
-# {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},
+# # not working below
+# {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},
 # {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},
 # {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},
 # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},
@@ -63,12 +61,25 @@ dslist = [
 estimator = randomwalkkernel
 param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},
               {'alpha': np.logspace(-10, 10, num=41, base=10)}]
-gaussiankernel = functools.partial(gaussiankernel, gamma=0.5)
+
+## for non-symbolic labels.
+#gkernels = [functools.partial(gaussiankernel, gamma=1 / ga)
+#            for ga in np.logspace(0, 10, num=11, base=10)]
+#mixkernels = [functools.partial(kernelproduct, deltakernel, gk) for gk in gkernels]
+#sub_kernels = [{'symb': deltakernel, 'nsymb': gkernels[i], 'mix': mixkernels[i]}
+#               for i in range(len(gkernels))]
+
+# for symbolic labels only.
+#gaussiankernel = functools.partial(gaussiankernel, gamma=0.5)
+mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
+sub_kernels = [{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}]
+
 for ds in dslist:
     print()
     print(ds['name'])
-    for compute_method in ['sylvester', 'conjugate', 'fp', 'spectral']:
+#    for compute_method in ['sylvester', 'conjugate', 'fp', 'spectral']:
+    for compute_method in ['conjugate', 'fp']:
         if compute_method == 'sylvester':
             param_grid_precomputed = {'compute_method': ['sylvester'],
 #                                      'weight': np.linspace(0.01, 0.10, 10)}
@@ -76,18 +87,12 @@ for ds in dslist:
         elif compute_method == 'conjugate':
             mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
             param_grid_precomputed = {'compute_method': ['conjugate'],
-                                      'node_kernels':
-                                      [{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}],
-                                      'edge_kernels':
-                                      [{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}],
+                                      'node_kernels': sub_kernels, 'edge_kernels': sub_kernels,
                                       'weight': np.logspace(-1, -10, num=10, base=10)}
         elif compute_method == 'fp':
             mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
             param_grid_precomputed = {'compute_method': ['fp'],
-                                      'node_kernels':
-                                      [{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}],
-                                      'edge_kernels':
-                                      [{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}],
+                                      'node_kernels': sub_kernels, 'edge_kernels': sub_kernels,
                                       'weight': np.logspace(-3, -10, num=8, base=10)}
         elif compute_method == 'spectral':
             param_grid_precomputed = {'compute_method': ['spectral'],


+29 -29  notebooks/run_spkernel.py

@@ -8,41 +8,40 @@ from pygraph.utils.kernels import deltakernel, gaussiankernel, kernelproduct

 # datasets
 dslist = [
-    {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
-     'task': 'regression'}, # node symb
-    {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
-     'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt'},
-    # contains single node graph, node symb
-    {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds'}, # node/edge symb
-    {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds'}, # unlabeled
-    {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt'}, # node/edge symb
+# {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
+#  'task': 'regression'}, # node symb
+# {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
+#  'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt'},
+# # contains single node graph, node symb
+# {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds'}, # node/edge symb
+# {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds'}, # unlabeled
+# {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt'}, # node/edge symb
     {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
     # node nsymb
     {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
     # node symb/nsymb
+    {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb
+# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1_A.txt'}, # node symb
+# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb
+# {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb
+#
 # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},
 # # node/edge symb
-# {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb
-#
-# {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb
-# {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb
-# {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb
-# {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'},
-# {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb
-# {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb
-# {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb
-# {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb
-# {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb
-#
-# {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb
-# {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb
-# {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb
-# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',
-#  'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
-# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',
-#  'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
-# {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',
-#  'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb
+# {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb
+# # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb
+# # # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb
+# {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'},
+#
+# # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb
+# # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb
+# # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb
+# # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb
+# # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb
+
+# # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb
+# # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb
+# {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',
+#  'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb

 # # not working below
 # {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},
@@ -52,6 +51,7 @@ dslist = [
 ]
 estimator = spkernel
 # hyper-parameters
+#gaussiankernel = functools.partial(gaussiankernel, gamma=0.5)
 mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
 param_grid_precomputed = {'node_kernels': [
     {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}]}


+35 -28  notebooks/run_structuralspkernel.py

@@ -14,22 +14,25 @@ from pygraph.kernels.structuralspKernel import structuralspkernel
 from pygraph.utils.kernels import deltakernel, gaussiankernel, kernelproduct


 dslist = [
-    {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
-     'task': 'regression'}, # node symb
-    {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
-     'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt'},
-    # contains single node graph, node symb
-    {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds'}, # node/edge symb
-    {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds'}, # unlabeled
-    {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt'}, # node/edge symb
-    {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
-    # node nsymb
-    {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
-    # node symb/nsymb
+# {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
+#  'task': 'regression'}, # node symb
+# {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
+#  'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt'},
+# # contains single node graph, node symb
+# {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds'}, # node/edge symb
+# {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds'}, # unlabeled
+# {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt'}, # node/edge symb
+# {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
+# # node nsymb
+# {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb
+# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1_A.txt'}, # node symb
+# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb
+# {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
+# # node symb/nsymb
+    {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb
+#
 # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},
 # # node/edge symb
-# {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb

 # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb
 # # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb
 # # # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb
@@ -37,33 +40,37 @@ dslist = [
 #
 # # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb
 # # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb
-# {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb, missing values
-# {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb, missing values
+# # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb
+# # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb
 # # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb

 # # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb
 # # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb
-# # {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb
-# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',
-#  'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
-# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',
-#  'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
 # {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',
 #  'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb

-# # not working below
-# {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},
+# # not working below
+# {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},
 # {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},
 # {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},
 # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},
 ]
 estimator = structuralspkernel
+
+## for non-symbolic labels.
+#gkernels = [functools.partial(gaussiankernel, gamma=1 / ga)
+#            for ga in np.logspace(0, 10, num=11, base=10)]
+#mixkernels = [functools.partial(kernelproduct, deltakernel, gk) for gk in gkernels]
+#sub_kernels = [{'symb': deltakernel, 'nsymb': gkernels[i], 'mix': mixkernels[i]}
+#               for i in range(len(gkernels))]
+
+# for symbolic labels only.
+#gaussiankernel = functools.partial(gaussiankernel, gamma=0.5)
 mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
-param_grid_precomputed = {'node_kernels':
-                          [{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}],
-                          'edge_kernels':
-                          [{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}],
-                          'compute_method': ['naive']}
+sub_kernels = [{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}]
+
+param_grid_precomputed = {'node_kernels': sub_kernels, 'edge_kernels': sub_kernels,
+                          'compute_method': ['naive']}
 param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},
               {'alpha': np.logspace(-10, 10, num=41, base=10)}]




+17 -13  notebooks/run_treeletkernel.py

@@ -8,27 +8,31 @@ Created on Mon Mar 21 11:19:33 2019
 from libs import *
 import multiprocessing
+import functools

 from pygraph.kernels.treeletKernel import treeletkernel
-from pygraph.utils.kernels import gaussiankernel, linearkernel, polynomialkernel
+from pygraph.utils.kernels import gaussiankernel, polynomialkernel


 dslist = [
-    {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
-     'task': 'regression'}, # node symb
+# {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
+#  'task': 'regression'}, # node symb
     {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
      'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt'},
     # contains single node graph, node symb
     {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds'}, # node/edge symb
     {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds'}, # unlabeled
     {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt'}, # node/edge symb
+# {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
+# # node nsymb
     {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
     # node symb/nsymb
+    {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1_A.txt'}, # node symb
+    {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb
+    {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb
+    {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
+    # node nsymb
+# {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb
+#
 # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},
 # # node/edge symb
-# {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb

 # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb
 # # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb
 # # # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb
@@ -42,11 +46,6 @@ dslist = [

 # # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb
 # # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb
-    {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb
-# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',
-#  'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
-# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',
-#  'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
 # {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',
 #  'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb


@@ -57,7 +56,12 @@ dslist = [
 # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},
 ]
 estimator = treeletkernel
-param_grid_precomputed = {'sub_kernel': [gaussiankernel, linearkernel, polynomialkernel]}
+gkernels = [functools.partial(gaussiankernel, gamma=1 / ga)
+#            for ga in np.linspace(1, 10, 10)]
+            for ga in np.logspace(0, 10, num=11, base=10)]
+pkernels = [functools.partial(polynomialkernel, d=d, c=c) for d in range(1, 11)
+            for c in np.logspace(0, 10, num=11, base=10)]
+param_grid_precomputed = {'sub_kernel': pkernels + gkernels}
 param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},
               {'alpha': np.logspace(-10, 10, num=41, base=10)}]




+18 -20  notebooks/run_untilhpathkernel.py

@@ -12,22 +12,25 @@ import multiprocessing
 from pygraph.kernels.untilHPathKernel import untilhpathkernel


 dslist = [
-    {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
-     'task': 'regression'}, # node symb
-    {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
-     'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt'},
-    # contains single node graph, node symb
-    {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds'}, # node/edge symb
-    {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds'}, # unlabeled
-    {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt'}, # node/edge symb
-    {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
-    # node nsymb
-    {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
-    # node symb/nsymb
+# {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
+#  'task': 'regression'}, # node symb
+# {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
+#  'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt'},
+# # contains single node graph, node symb
+# {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds'}, # node/edge symb
+# {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds'}, # unlabeled
+# {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt'}, # node/edge symb
+# {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
+# # node nsymb
+# {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
+# # node symb/nsymb
+# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1_A.txt'}, # node symb
+# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb
+# {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb
+    {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb
+#
 # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},
 # # node/edge symb
-# {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb

 # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb
 # # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb
 # # # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb
@@ -41,11 +44,6 @@ dslist = [

 # # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb
 # # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb
-# # {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb
-# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',
-#  'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
-# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',
-#  'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
 # {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',
 #  'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb


@@ -57,7 +55,7 @@ dslist = [
 ]
 estimator = untilhpathkernel
 param_grid_precomputed = {'depth': np.linspace(1, 10, 10), # [2],
-                          'k_func': ['MinMax', 'tanimoto'],
+                          'k_func': ['MinMax'], # ['MinMax', 'tanimoto'],
                           'compute_method': ['trie']} # ['MinMax']}
 param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},
               {'alpha': np.logspace(-10, 10, num=41, base=10)}]


+18 -18  notebooks/run_weisfeilerlehmankernel.py

@@ -10,26 +10,29 @@ from libs import *
 import multiprocessing

 from pygraph.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel
+from pygraph.utils.kernels import gaussiankernel, polynomialkernel


 dslist = [
-    {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
-     'task': 'regression'}, # node symb
-    {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
-     'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt'},
-    # contains single node graph, node symb
-    {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds'}, # node/edge symb
-    {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds'}, # unlabeled
-    {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt'}, # node/edge symb
-    {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
-    # node nsymb
-    {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
-    # node symb/nsymb
+# {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
+#  'task': 'regression'}, # node symb
+# {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
+#  'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt'},
+# # contains single node graph, node symb
+# {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds'}, # node/edge symb
+# {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds'}, # unlabeled
+# {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt'}, # node/edge symb
+# {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
+# # node nsymb
+# {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
+# # node symb/nsymb
+# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1_A.txt'}, # node symb
+# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb
+# {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb
+    {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb
+#
 # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},
 # # node/edge symb
+    {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb
+#
 # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb
 # # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb
 # # # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb
@@ -43,9 +46,6 @@ dslist = [

 # # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb
 # # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb
-# {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb
-    {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1_A.txt'}, # node symb
-    {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb
 # {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',
 #  'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb




+111 -114  notebooks/utils/get_dataset_attributes.ipynb

@@ -13,7 +13,7 @@
 "text": [
 "\n",
 "Acyclic:\n",
- "substructures : {'non linear', 'linear'}\n",
+ "substructures : {'linear', 'non linear'}\n",
 "node_labeled : True\n",
 "edge_labeled : False\n",
 "is_directed : False\n",
@@ -38,7 +38,7 @@
 "\n",
 "\n",
 "Alkane:\n",
- "substructures : {'non linear', 'linear'}\n",
+ "substructures : {'linear', 'non linear'}\n",
 "node_labeled : False\n",
 "edge_labeled : False\n",
 "is_directed : False\n",
@@ -63,7 +63,7 @@
 "\n",
 "\n",
 "MAO:\n",
- "substructures : {'non linear', 'linear'}\n",
+ "substructures : {'linear', 'non linear'}\n",
 "node_labeled : True\n",
 "edge_labeled : True\n",
 "is_directed : False\n",
@@ -88,7 +88,7 @@
 "\n",
 "\n",
 "PAH:\n",
- "substructures : {'non linear', 'linear'}\n",
+ "substructures : {'linear', 'non linear'}\n",
 "node_labeled : False\n",
 "edge_labeled : False\n",
 "is_directed : False\n",
@@ -113,7 +113,7 @@
 "\n",
 "\n",
 "MUTAG:\n",
- "substructures : {'non linear', 'linear'}\n",
+ "substructures : {'linear', 'non linear'}\n",
 "node_labeled : True\n",
 "edge_labeled : True\n",
 "is_directed : False\n",
@@ -131,14 +131,14 @@
 "min_fill_factor : 0.039540816326530615\n",
 "max_fill_factor : 0.1\n",
 "node_label_num : 7\n",
- "edge_label_num : 11\n",
+ "edge_label_num : 4\n",
 "node_attr_dim : 0\n",
 "edge_attr_dim : 0\n",
 "class_number : 2\n",
 "\n",
 "\n",
 "Letter-med:\n",
- "substructures : {'non linear', 'linear'}\n",
+ "substructures : {'linear', 'non linear'}\n",
 "node_labeled : False\n",
 "edge_labeled : False\n",
 "is_directed : False\n",
@@ -163,7 +163,7 @@
 "\n",
 "\n",
 "ENZYMES:\n",
- "substructures : {'non linear', 'linear'}\n",
+ "substructures : {'linear', 'non linear'}\n",
 "node_labeled : True\n",
 "edge_labeled : False\n",
 "is_directed : False\n",
@@ -187,33 +187,8 @@
 "class_number : 6\n",
 "\n",
 "\n",
- "Mutagenicity:\n",
- "substructures : {'non linear', 'linear'}\n",
- "node_labeled : True\n",
- "edge_labeled : True\n",
- "is_directed : False\n",
- "dataset_size : 4337\n",
- "ave_node_num : 30.317731150564907\n",
- "min_node_num : 4\n",
- "max_node_num : 417\n",
- "ave_edge_num : 30.76942587041734\n",
- "min_edge_num : 3\n",
- "max_edge_num : 112\n",
- "ave_node_degree : 2.0379886162441148\n",
- "min_node_degree : 0.47961630695443647\n",
- "max_node_degree : 2.3703703703703702\n",
- "ave_fill_factor : 0.0431047931997047\n",
- "min_fill_factor : 0.0005750795047415305\n",
- "max_fill_factor : 0.1875\n",
- "node_label_num : 14\n",
- "edge_label_num : 3\n",
- "node_attr_dim : 0\n",
- "edge_attr_dim : 0\n",
- "class_number : 2\n",
- "\n",
- "\n",
 "D&D:\n",
- "substructures : {'non linear', 'linear'}\n",
+ "substructures : {'linear', 'non linear'}\n",
 "node_labeled : True\n",
 "edge_labeled : False\n",
 "is_directed : False\n",
@@ -237,8 +212,58 @@
 "class_number : 2\n",
 "\n",
 "\n",
+ "NCI1:\n",
+ "substructures : {'linear', 'non linear'}\n",
+ "node_labeled : True\n",
+ "edge_labeled : False\n",
+ "is_directed : False\n",
+ "dataset_size : 4110\n",
+ "ave_node_num : 29.8654501216545\n",
+ "min_node_num : 3\n",
+ "max_node_num : 111\n",
+ "ave_edge_num : 32.3\n",
+ "min_edge_num : 2\n",
+ "max_edge_num : 119\n",
+ "ave_node_degree : 2.155013792267071\n",
+ "min_node_degree : 0.8\n",
+ "max_node_degree : 2.769230769230769\n",
+ "ave_fill_factor : 0.04239828192835043\n",
+ "min_fill_factor : 0.009522961908152367\n",
+ "max_fill_factor : 0.2222222222222222\n",
+ "node_label_num : 37\n",
+ "edge_label_num : 0\n",
+ "node_attr_dim : 0\n",
+ "edge_attr_dim : 0\n",
+ "class_number : 2\n",
+ "\n",
+ "\n",
+ "NCI109:\n",
+ "substructures : {'linear', 'non linear'}\n",
+ "node_labeled : True\n",
+ "edge_labeled : False\n",
+ "is_directed : False\n",
+ "dataset_size : 4127\n",
+ "ave_node_num : 29.681124303368065\n",
+ "min_node_num : 4\n",
+ "max_node_num : 111\n",
+ "ave_edge_num : 32.13084565059365\n",
+ "min_edge_num : 3\n",
+ "max_edge_num : 119\n",
+ "ave_node_degree : 2.156446168619097\n",
+ "min_node_degree : 1.0909090909090908\n",
+ "max_node_degree : 2.769230769230769\n",
+ "ave_fill_factor : 0.04263668408405519\n",
+ "min_fill_factor : 0.009522961908152367\n",
+ "max_fill_factor : 0.1875\n",
+ "node_label_num : 38\n",
+ "edge_label_num : 0\n",
+ "node_attr_dim : 0\n",
+ "edge_attr_dim : 0\n",
+ "class_number : 2\n",
+ "\n",
+ "\n",
 "AIDS:\n",
- "substructures : {'non linear', 'linear'}\n",
+ "substructures : {'linear', 'non linear'}\n",
 "node_labeled : True\n",
 "edge_labeled : True\n",
 "is_directed : False\n",
@@ -262,6 +287,31 @@
 "class_number : 2\n",
 "\n",
 "\n",
+ "Mutagenicity:\n",
+ "substructures : {'linear', 'non linear'}\n",
+ "node_labeled : True\n",
+ "edge_labeled : True\n",
+ "is_directed : False\n",
+ "dataset_size : 4337\n",
+ "ave_node_num : 30.317731150564907\n",
+ "min_node_num : 4\n",
+ "max_node_num : 417\n",
+ "ave_edge_num : 30.76942587041734\n",
+ "min_edge_num : 3\n",
+ "max_edge_num : 112\n",
+ "ave_node_degree : 2.0379886162441148\n",
+ "min_node_degree : 0.47961630695443647\n",
+ "max_node_degree : 2.3703703703703702\n",
+ "ave_fill_factor : 0.0431047931997047\n",
+ "min_fill_factor : 0.0005750795047415305\n",
+ "max_fill_factor : 0.1875\n",
+ "node_label_num : 14\n",
+ "edge_label_num : 3\n",
+ "node_attr_dim : 0\n",
+ "edge_attr_dim : 0\n",
+ "class_number : 2\n",
+ "\n",
+ "\n",
 "FIRSTMM_DB:\n",
 "substructures : {'non linear'}\n",
 "node_labeled : True\n",
@@ -288,7 +338,7 @@
 "\n",
 "\n",
 "MSRC9:\n",
- "substructures : {'non linear', 'linear'}\n",
+ "substructures : {'linear', 'non linear'}\n",
 "node_labeled : True\n",
 "edge_labeled : False\n",
 "is_directed : False\n",
@@ -313,7 +363,7 @@
 "\n",
 "\n",
 "MSRC21:\n",
- "substructures : {'non linear', 'linear'}\n",
+ "substructures : {'linear', 'non linear'}\n",
 "node_labeled : True\n",
 "edge_labeled : False\n",
 "is_directed : False\n",
@@ -335,10 +385,16 @@
 "node_attr_dim : 0\n",
 "edge_attr_dim : 0\n",
 "class_number : 20\n",
+ "\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
 "\n",
 "SYNTHETIC:\n",
- "substructures : {'non linear', 'linear'}\n",
+ "substructures : {'linear', 'non linear'}\n",
 "node_labeled : True\n",
 "edge_labeled : False\n",
 "is_directed : False\n",
@@ -363,7 +419,7 @@
 "\n",
 "\n",
 "BZR:\n",
- "substructures : {'non linear', 'linear'}\n",
+ "substructures : {'linear', 'non linear'}\n",
 "node_labeled : True\n",
 "edge_labeled : False\n",
 "is_directed : False\n",
@@ -385,16 +441,10 @@
 "node_attr_dim : 3\n",
 "edge_attr_dim : 0\n",
 "class_number : 2\n",
- "\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
 "\n",
 "COX2:\n",
- "substructures : {'non linear', 'linear'}\n",
+ "substructures : {'linear', 'non linear'}\n",
 "node_labeled : True\n",
 "edge_labeled : False\n",
 "is_directed : False\n",
@@ -419,7 +469,7 @@
 "\n",
 "\n",
 "DHFR:\n",
- "substructures : {'non linear', 'linear'}\n",
+ "substructures : {'linear', 'non linear'}\n",
 "node_labeled : True\n",
 "edge_labeled : False\n",
 "is_directed : False\n",
@@ -444,7 +494,7 @@
 "\n",
 "\n",
 "PROTEINS:\n",
- "substructures : {'non linear', 'linear'}\n",
+ "substructures : {'linear', 'non linear'}\n",
 "node_labeled : True\n",
 "edge_labeled : False\n",
 "is_directed : False\n",
@@ -469,7 +519,7 @@
 "\n",
 "\n",
 "PROTEINS_full:\n",
- "substructures : {'non linear', 'linear'}\n",
+ "substructures : {'linear', 'non linear'}\n",
 "node_labeled : True\n",
 "edge_labeled : False\n",
 "is_directed : False\n",
@@ -492,61 +542,11 @@
 "edge_attr_dim : 0\n",
 "class_number : 2\n",
 "\n",
- "\n",
- "NCI1:\n",
- "substructures : {'non linear', 'linear'}\n",
- "node_labeled : True\n",
- "edge_labeled : False\n",
- "is_directed : False\n",
- "dataset_size : 4110\n",
- "ave_node_num : 29.8654501216545\n",
- "min_node_num : 3\n",
- "max_node_num : 111\n",
- "ave_edge_num : 32.3\n",
- "min_edge_num : 2\n",
- "max_edge_num : 119\n",
- "ave_node_degree : 2.155013792267071\n",
- "min_node_degree : 0.8\n",
- "max_node_degree : 2.769230769230769\n",
- "ave_fill_factor : 0.04239828192835043\n",
- "min_fill_factor : 0.009522961908152367\n",
- "max_fill_factor : 0.2222222222222222\n",
- "node_label_num : 37\n",
- "edge_label_num : 0\n",
- "node_attr_dim : 0\n",
- "edge_attr_dim : 0\n",
- "class_number : 2\n",
- "\n",
- "\n",
- "NCI109:\n",
- "substructures : {'non linear', 'linear'}\n",
- "node_labeled : True\n",
- "edge_labeled : False\n",
- "is_directed : False\n",
- "dataset_size : 4127\n",
- "ave_node_num : 29.681124303368065\n",
- "min_node_num : 4\n",
- "max_node_num : 111\n",
- "ave_edge_num : 32.13084565059365\n",
- "min_edge_num : 3\n",
- "max_edge_num : 119\n",
- "ave_node_degree : 2.156446168619097\n",
- "min_node_degree : 1.0909090909090908\n",
- "max_node_degree : 2.769230769230769\n",
- "ave_fill_factor : 0.04263668408405519\n",
- "min_fill_factor : 0.009522961908152367\n",
- "max_fill_factor : 0.1875\n",
- "node_label_num : 38\n",
- "edge_label_num : 0\n",
- "node_attr_dim : 0\n",
- "edge_attr_dim : 0\n",
- "class_number : 2\n",
- "\n",
- "load SDF: 100%|██████████| 4457424/4457424 [00:08<00:00, 497346.72it/s]\n",
- "ajust data: 100%|██████████| 42687/42687 [00:09<00:00, 4689.76it/s] \n",
+ "load SDF: 100%|██████████| 4457424/4457424 [00:09<00:00, 489414.03it/s]\n",
+ "ajust data: 100%|██████████| 42687/42687 [00:09<00:00, 4562.13it/s] \n",
 "\n",
 "NCI-HIV:\n",
- "substructures : {'non linear', 'linear'}\n",
+ "substructures : {'linear', 'non linear'}\n",
 "node_labeled : True\n",
 "edge_labeled : True\n",
 "is_directed : False\n",
@@ -584,14 +584,15 @@
" 'dataset_y': '../../datasets/Alkane/dataset_boiling_point_names.txt',},\n", " 'dataset_y': '../../datasets/Alkane/dataset_boiling_point_names.txt',},\n",
" {'name': 'MAO', 'dataset': '../../datasets/MAO/dataset.ds',},\n", " {'name': 'MAO', 'dataset': '../../datasets/MAO/dataset.ds',},\n",
" {'name': 'PAH', 'dataset': '../../datasets/PAH/dataset.ds',},\n", " {'name': 'PAH', 'dataset': '../../datasets/PAH/dataset.ds',},\n",
" {'name': 'MUTAG', 'dataset': '../../datasets/MUTAG/MUTAG.mat',\n",
" 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}},\n",
" {'name': 'MUTAG', 'dataset': '../../datasets/MUTAG/MUTAG_A.txt'},\n",
" {'name': 'Letter-med', 'dataset': '../../datasets/Letter-med/Letter-med_A.txt'},\n", " {'name': 'Letter-med', 'dataset': '../../datasets/Letter-med/Letter-med_A.txt'},\n",
" {'name': 'ENZYMES', 'dataset': '../../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},\n", " {'name': 'ENZYMES', 'dataset': '../../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},\n",
" {'name': 'Mutagenicity', 'dataset': '../../datasets/Mutagenicity/Mutagenicity_A.txt'},\n",
" {'name': 'D&D', 'dataset': '../../datasets/D&D/DD.mat',\n",
" 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}},\n",
" {'name': 'D&D', 'dataset': '../../datasets/DD/DD_A.txt'},\n",
" {'name': 'NCI1', 'dataset': '../../datasets/NCI1/NCI1_A.txt'},\n",
" {'name': 'NCI109', 'dataset': '../../datasets/NCI109/NCI109_A.txt'},\n",
" {'name': 'AIDS', 'dataset': '../../datasets/AIDS/AIDS_A.txt'},\n", " {'name': 'AIDS', 'dataset': '../../datasets/AIDS/AIDS_A.txt'},\n",
" \n",
" {'name': 'Mutagenicity', 'dataset': '../../datasets/Mutagenicity/Mutagenicity_A.txt'},\n",
" {'name': 'FIRSTMM_DB', 'dataset': '../../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'},\n", " {'name': 'FIRSTMM_DB', 'dataset': '../../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'},\n",
" {'name': 'MSRC9', 'dataset': '../../datasets/MSRC_9_txt/MSRC_9_A.txt'},\n", " {'name': 'MSRC9', 'dataset': '../../datasets/MSRC_9_txt/MSRC_9_A.txt'},\n",
" {'name': 'MSRC21', 'dataset': '../../datasets/MSRC_21_txt/MSRC_21_A.txt'},\n", " {'name': 'MSRC21', 'dataset': '../../datasets/MSRC_21_txt/MSRC_21_A.txt'},\n",
@@ -601,10 +602,6 @@
" {'name': 'DHFR', 'dataset': '../../datasets/DHFR_txt/DHFR_A_sparse.txt'}, \n", " {'name': 'DHFR', 'dataset': '../../datasets/DHFR_txt/DHFR_A_sparse.txt'}, \n",
" {'name': 'PROTEINS', 'dataset': '../../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'},\n", " {'name': 'PROTEINS', 'dataset': '../../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'},\n",
" {'name': 'PROTEINS_full', 'dataset': '../../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, \n", " {'name': 'PROTEINS_full', 'dataset': '../../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, \n",
" {'name': 'NCI1', 'dataset': '../../datasets/NCI1/NCI1.mat',\n",
" 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}},\n",
" {'name': 'NCI109', 'dataset': '../../datasets/NCI109/NCI109.mat',\n",
" 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}},\n",
" {'name': 'NCI-HIV', 'dataset': '../../datasets/NCI-HIV/AIDO99SD.sdf',\n", " {'name': 'NCI-HIV', 'dataset': '../../datasets/NCI-HIV/AIDO99SD.sdf',\n",
" 'dataset_y': '../../datasets/NCI-HIV/aids_conc_may04.txt',},\n", " 'dataset_y': '../../datasets/NCI-HIV/aids_conc_may04.txt',},\n",
"\n", "\n",
@@ -646,7 +643,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.6.7"
"version": "3.6.8"
} }
}, },
"nbformat": 4, "nbformat": 4,


+ 6644
- 3316
notebooks/utils/plot_all_graphs.ipynb
File diff suppressed because it is too large
View File


+ 238
- 174
preimage/gk_iam.py View File

@@ -22,6 +22,11 @@ from iam import iam, test_iam_with_more_graphs_as_init, test_iam_moreGraphsAsIni
sys.path.insert(0, "../") sys.path.insert(0, "../")
from pygraph.kernels.marginalizedKernel import marginalizedkernel from pygraph.kernels.marginalizedKernel import marginalizedkernel
from pygraph.kernels.untilHPathKernel import untilhpathkernel from pygraph.kernels.untilHPathKernel import untilhpathkernel
from pygraph.kernels.spKernel import spkernel
import functools
from pygraph.utils.kernels import deltakernel, gaussiankernel, kernelproduct
from pygraph.kernels.structuralspKernel import structuralspkernel
from median import draw_Letter_graph




def gk_iam(Gn, alpha): def gk_iam(Gn, alpha):
@@ -119,6 +124,8 @@ def gk_iam_nearest(Gn, alpha, idx_gi, Kmatrix, k, r_max):
for gi in Gk: for gi in Gk:
nx.draw_networkx(gi) nx.draw_networkx(gi)
plt.show() plt.show()
print(gi.nodes(data=True))
print(gi.edges(data=True))
Gs_nearest = Gk.copy() Gs_nearest = Gk.copy()
# gihat_list = [] # gihat_list = []
@@ -132,6 +139,8 @@ def gk_iam_nearest(Gn, alpha, idx_gi, Kmatrix, k, r_max):
g_tmp = test_iam_with_more_graphs_as_init(Gs_nearest, Gs_nearest, c_ei=1, c_er=1, c_es=1) g_tmp = test_iam_with_more_graphs_as_init(Gs_nearest, Gs_nearest, c_ei=1, c_er=1, c_es=1)
nx.draw_networkx(g_tmp) nx.draw_networkx(g_tmp)
plt.show() plt.show()
print(g_tmp.nodes(data=True))
print(g_tmp.edges(data=True))
# compute distance between phi and the new generated graph. # compute distance between phi and the new generated graph.
gi_list = [Gn[i] for i in idx_gi] gi_list = [Gn[i] for i in idx_gi]
@@ -166,28 +175,249 @@ def gk_iam_nearest(Gn, alpha, idx_gi, Kmatrix, k, r_max):
return dhat, ghat return dhat, ghat




def dis_gstar(idx_g, idx_gi, alpha, Kmatrix):
#def gk_iam_nearest_multi(Gn, alpha, idx_gi, Kmatrix, k, r_max):
# """This function constructs graph pre-image by the iterative pre-image
# framework in reference [1], algorithm 1, where the step of generating new
# graphs randomly is replaced by the IAM algorithm in reference [2].
#
# notes
# -----
# Every time a set of n better graphs is acquired, their distances in kernel space are
# compared with the k nearest ones, and the k nearest distances from the k+n
# distances will be used as the new ones.
# """
# Gn_median = [Gn[idx].copy() for idx in idx_gi]
# # compute k nearest neighbors of phi in DN.
# dis_list = [] # distance between g_star and each graph.
# for ig, g in tqdm(enumerate(Gn), desc='computing distances', file=sys.stdout):
# dtemp = dis_gstar(ig, idx_gi, alpha, Kmatrix)
## dtemp = k_list[ig] - 2 * (alpha * k_g1_list[ig] + (1 - alpha) *
## k_g2_list[ig]) + (alpha * alpha * k_list[0] + alpha *
## (1 - alpha) * k_g2_list[0] + (1 - alpha) * alpha *
## k_g1_list[6] + (1 - alpha) * (1 - alpha) * k_list[6])
# dis_list.append(dtemp)
#
# # sort
# sort_idx = np.argsort(dis_list)
# dis_gs = [dis_list[idis] for idis in sort_idx[0:k]] # the k shortest distances
# nb_best = len(np.argwhere(dis_gs == dis_gs[0]).flatten().tolist())
# g0hat_list = [Gn[idx] for idx in sort_idx[0:nb_best]] # the nearest neighbors of phi in DN
# if dis_gs[0] == 0: # the exact pre-image.
# print('The exact pre-image is found from the input dataset.')
# return 0, g0hat_list
# dhat = dis_gs[0] # the nearest distance
# ghat_list = [g.copy() for g in g0hat_list]
# for g in ghat_list:
# nx.draw_networkx(g)
# plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
# Gk = [Gn[ig].copy() for ig in sort_idx[0:k]] # the k nearest neighbors
# for gi in Gk:
# nx.draw_networkx(gi)
# plt.show()
# print(gi.nodes(data=True))
# print(gi.edges(data=True))
# Gs_nearest = Gk.copy()
## gihat_list = []
#
## i = 1
# r = 1
# while r < r_max:
# print('r =', r)
## found = False
## Gs_nearest = Gk + gihat_list
## g_tmp = iam(Gs_nearest)
# g_tmp_list = test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations(
# Gn_median, Gs_nearest, c_ei=1, c_er=1, c_es=1)
# for g in g_tmp_list:
# nx.draw_networkx(g)
# plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
#
# # compute distance between phi and the new generated graphs.
# gi_list = [Gn[i] for i in idx_gi]
# knew = compute_kernel(g_tmp_list + gi_list, 'marginalizedkernel', False)
# dnew_list = []
# for idx, g_tmp in enumerate(g_tmp_list):
# dnew_list.append(dis_gstar(idx, range(len(g_tmp_list),
# len(g_tmp_list) + len(gi_list) + 1), alpha, knew))
#
## dnew = knew[0, 0] - 2 * (alpha[0] * knew[0, 1] + alpha[1] *
## knew[0, 2]) + (alpha[0] * alpha[0] * k_list[0] + alpha[0] *
## alpha[1] * k_g2_list[0] + alpha[1] * alpha[0] *
## k_g1_list[1] + alpha[1] * alpha[1] * k_list[1])
#
# # find the new k nearest graphs.
# dis_gs = dnew_list + dis_gs # add the new nearest distances.
# Gs_nearest = [g.copy() for g in g_tmp_list] + Gs_nearest # add the corresponding graphs.
# sort_idx = np.argsort(dis_gs)
# if len([i for i in sort_idx[0:k] if i < len(dnew_list)]) > 0:
# print('We got better k nearest neighbors! Hurray!')
# dis_gs = [dis_gs[idx] for idx in sort_idx[0:k]] # the new k nearest distances.
# print(dis_gs[-1])
# Gs_nearest = [Gs_nearest[idx] for idx in sort_idx[0:k]]
# nb_best = len(np.argwhere(dis_gs == dis_gs[0]).flatten().tolist())
# if len([i for i in sort_idx[0:nb_best] if i < len(dnew_list)]) > 0:
# print('I have smaller or equal distance!')
# dhat = dis_gs[0]
# print(str(dhat) + '->' + str(dhat))
# idx_best_list = np.argwhere(dnew_list == dhat).flatten().tolist()
# ghat_list = [g_tmp_list[idx].copy() for idx in idx_best_list]
# for g in ghat_list:
# nx.draw_networkx(g)
# plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
# r = 0
# else:
# r += 1
#
# return dhat, ghat_list


def gk_iam_nearest_multi(Gn_init, Gn_median, alpha, idx_gi, Kmatrix, k, r_max, gkernel):
"""This function constructs graph pre-image by the iterative pre-image
framework in reference [1], algorithm 1, where the step of generating new
graphs randomly is replaced by the IAM algorithm in reference [2].
notes
-----
Every time a set of n better graphs is acquired, their distances in kernel space are
compared with the k nearest ones, and the k nearest distances from the k+n
distances will be used as the new ones.
"""
# compute k nearest neighbors of phi in DN.
dis_list = [] # distance between g_star and each graph.
term3 = 0
for i1, a1 in enumerate(alpha):
for i2, a2 in enumerate(alpha):
term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]]
for ig, g in tqdm(enumerate(Gn_init), desc='computing distances', file=sys.stdout):
dtemp = dis_gstar(ig, idx_gi, alpha, Kmatrix, term3=term3)
# dtemp = k_list[ig] - 2 * (alpha * k_g1_list[ig] + (1 - alpha) *
# k_g2_list[ig]) + (alpha * alpha * k_list[0] + alpha *
# (1 - alpha) * k_g2_list[0] + (1 - alpha) * alpha *
# k_g1_list[6] + (1 - alpha) * (1 - alpha) * k_list[6])
dis_list.append(dtemp)
# sort
sort_idx = np.argsort(dis_list)
dis_gs = [dis_list[idis] for idis in sort_idx[0:k]] # the k shortest distances
nb_best = len(np.argwhere(dis_gs == dis_gs[0]).flatten().tolist())
g0hat_list = [Gn_init[idx] for idx in sort_idx[0:nb_best]] # the nearest neighbors of phi in DN
if dis_gs[0] == 0: # the exact pre-image.
print('The exact pre-image is found from the input dataset.')
return 0, g0hat_list
dhat = dis_gs[0] # the nearest distance
ghat_list = [g.copy() for g in g0hat_list]
for g in ghat_list:
draw_Letter_graph(g)
# nx.draw_networkx(g)
# plt.show()
print(g.nodes(data=True))
print(g.edges(data=True))
Gk = [Gn_init[ig].copy() for ig in sort_idx[0:k]] # the k nearest neighbors
for gi in Gk:
# nx.draw_networkx(gi)
# plt.show()
draw_Letter_graph(gi)
print(gi.nodes(data=True))
print(gi.edges(data=True))
Gs_nearest = Gk.copy()
# gihat_list = []
# i = 1
r = 1
while r < r_max:
print('r =', r)
# found = False
# Gs_nearest = Gk + gihat_list
# g_tmp = iam(Gs_nearest)
g_tmp_list = test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations(
Gn_median, Gs_nearest, c_ei=1, c_er=1, c_es=1)
for g in g_tmp_list:
# nx.draw_networkx(g)
# plt.show()
draw_Letter_graph(g)
print(g.nodes(data=True))
print(g.edges(data=True))
# compute distance between phi and the new generated graphs.
knew = compute_kernel(g_tmp_list + Gn_median, gkernel, False)
dnew_list = []
for idx, g_tmp in enumerate(g_tmp_list):
dnew_list.append(dis_gstar(idx, range(len(g_tmp_list),
len(g_tmp_list) + len(Gn_median) + 1), alpha, knew,
withterm3=False))
# dnew = knew[0, 0] - 2 * (alpha[0] * knew[0, 1] + alpha[1] *
# knew[0, 2]) + (alpha[0] * alpha[0] * k_list[0] + alpha[0] *
# alpha[1] * k_g2_list[0] + alpha[1] * alpha[0] *
# k_g1_list[1] + alpha[1] * alpha[1] * k_list[1])
# find the new k nearest graphs.
dis_gs = dnew_list + dis_gs # add the new nearest distances.
Gs_nearest = [g.copy() for g in g_tmp_list] + Gs_nearest # add the corresponding graphs.
sort_idx = np.argsort(dis_gs)
if len([i for i in sort_idx[0:k] if i < len(dnew_list)]) > 0:
print('We got better k nearest neighbors! Hurray!')
dis_gs = [dis_gs[idx] for idx in sort_idx[0:k]] # the new k nearest distances.
print(dis_gs[-1])
Gs_nearest = [Gs_nearest[idx] for idx in sort_idx[0:k]]
nb_best = len(np.argwhere(dis_gs == dis_gs[0]).flatten().tolist())
if len([i for i in sort_idx[0:nb_best] if i < len(dnew_list)]) > 0:
print('I have smaller or equal distance!')
print(str(dhat) + '->' + str(dis_gs[0]))
dhat = dis_gs[0]
idx_best_list = np.argwhere(dnew_list == dhat).flatten().tolist()
ghat_list = [g_tmp_list[idx].copy() for idx in idx_best_list]
for g in ghat_list:
# nx.draw_networkx(g)
# plt.show()
draw_Letter_graph(g)
print(g.nodes(data=True))
print(g.edges(data=True))
r = 0
else:
r += 1
return dhat, ghat_list


def dis_gstar(idx_g, idx_gi, alpha, Kmatrix, term3=0, withterm3=True):
term1 = Kmatrix[idx_g, idx_g] term1 = Kmatrix[idx_g, idx_g]
term2 = 0 term2 = 0
for i, a in enumerate(alpha): for i, a in enumerate(alpha):
term2 += a * Kmatrix[idx_g, idx_gi[i]] term2 += a * Kmatrix[idx_g, idx_gi[i]]
term2 *= 2 term2 *= 2
term3 = 0
for i1, a1 in enumerate(alpha):
for i2, a2 in enumerate(alpha):
term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]]
if not withterm3:
for i1, a1 in enumerate(alpha):
for i2, a2 in enumerate(alpha):
term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]]
return np.sqrt(term1 - term2 + term3) return np.sqrt(term1 - term2 + term3)
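# A minimal self-check (toy numbers, not from this repo) of what dis_gstar
# returns: the kernel-space distance between graph g and the weighted point
# psi = sum_i alpha_i * phi(g_i), i.e.
#   d(g, psi)^2 = k(g,g) - 2*sum_i alpha_i*k(g,g_i)
#                 + sum_{i,j} alpha_i*alpha_j*k(g_i,g_j)
import numpy as np
K = np.array([[1.0, 0.5, 0.2],
              [0.5, 1.0, 0.4],
              [0.2, 0.4, 1.0]])  # made-up Gram matrix over [g, g1, g2]
alpha = [0.5, 0.5]
idx_g, idx_gi = 0, [1, 2]
term1 = K[idx_g, idx_g]
term2 = 2 * sum(a * K[idx_g, idx_gi[i]] for i, a in enumerate(alpha))
term3 = sum(a1 * a2 * K[idx_gi[i1], idx_gi[i2]]
            for i1, a1 in enumerate(alpha)
            for i2, a2 in enumerate(alpha))
print(np.sqrt(term1 - term2 + term3))  # 1.0 for this toy matrix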




def compute_kernel(Gn, graph_kernel, verbose): def compute_kernel(Gn, graph_kernel, verbose):
if graph_kernel == 'marginalizedkernel': if graph_kernel == 'marginalizedkernel':
Kmatrix, _ = marginalizedkernel(Gn, node_label='atom', edge_label=None, Kmatrix, _ = marginalizedkernel(Gn, node_label='atom', edge_label=None,
p_quit=0.3, n_iteration=19, remove_totters=False,
p_quit=0.03, n_iteration=20, remove_totters=False,
n_jobs=multiprocessing.cpu_count(), verbose=verbose) n_jobs=multiprocessing.cpu_count(), verbose=verbose)
elif graph_kernel == 'untilhpathkernel': elif graph_kernel == 'untilhpathkernel':
Kmatrix, _ = untilhpathkernel(Gn, node_label='atom', edge_label='bond_type', Kmatrix, _ = untilhpathkernel(Gn, node_label='atom', edge_label='bond_type',
depth=2, k_func='MinMax', compute_method='trie',
depth=10, k_func='MinMax', compute_method='trie',
n_jobs=multiprocessing.cpu_count(), verbose=verbose) n_jobs=multiprocessing.cpu_count(), verbose=verbose)
elif graph_kernel == 'spkernel':
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
Kmatrix, _, _ = spkernel(Gn, node_label='atom', node_kernels=
{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel},
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
elif graph_kernel == 'structuralspkernel':
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
Kmatrix, _ = structuralspkernel(Gn, node_label='atom', node_kernels=
{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel},
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
# normalization # normalization
Kmatrix_diag = Kmatrix.diagonal().copy() Kmatrix_diag = Kmatrix.diagonal().copy()
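# The normalization continued below this hunk is the standard cosine
# normalization (see the commented-out copy in preimage.py further down):
#   K'[i][j] = K[i][j] / sqrt(K[i][i] * K[j][j]), so every self-kernel is 1.
# A vectorized equivalent, as a sketch:
import numpy as np
def normalize_gram(K):
    d = np.sqrt(np.diag(K))
    return K / np.outer(d, d)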
@@ -204,170 +434,4 @@ def gram2distances(Kmatrix):
for i2 in range(len(Kmatrix)): for i2 in range(len(Kmatrix)):
dmatrix[i1, i2] = Kmatrix[i1, i1] + Kmatrix[i2, i2] - 2 * Kmatrix[i1, i2] dmatrix[i1, i2] = Kmatrix[i1, i1] + Kmatrix[i2, i2] - 2 * Kmatrix[i1, i2]
dmatrix = np.sqrt(dmatrix) dmatrix = np.sqrt(dmatrix)
return dmatrix

# --------------------------- These are tests --------------------------------#
def test_who_is_the_closest_in_kernel_space(Gn):
idx_gi = [0, 6]
g1 = Gn[idx_gi[0]]
g2 = Gn[idx_gi[1]]
# create the "median" graph.
gnew = g2.copy()
gnew.remove_node(0)
nx.draw_networkx(gnew)
plt.show()
print(gnew.nodes(data=True))
Gn = [gnew] + Gn
# compute gram matrix
Kmatrix = compute_kernel(Gn, 'untilhpathkernel', True)
# the distance matrix
dmatrix = gram2distances(Kmatrix)
print(np.sort(dmatrix[idx_gi[0] + 1]))
print(np.argsort(dmatrix[idx_gi[0] + 1]))
print(np.sort(dmatrix[idx_gi[1] + 1]))
print(np.argsort(dmatrix[idx_gi[1] + 1]))
# for all g in Gn, compute (d(g1, g) + d(g2, g)) / 2
dis_median = [(dmatrix[i, idx_gi[0] + 1] + dmatrix[i, idx_gi[1] + 1]) / 2 for i in range(len(Gn))]
print(np.sort(dis_median))
print(np.argsort(dis_median))
return


def test_who_is_the_closest_in_GED_space(Gn):
from iam import GED
idx_gi = [0, 6]
g1 = Gn[idx_gi[0]]
g2 = Gn[idx_gi[1]]
# create the "median" graph.
gnew = g2.copy()
gnew.remove_node(0)
nx.draw_networkx(gnew)
plt.show()
print(gnew.nodes(data=True))
Gn = [gnew] + Gn
# compute GEDs
ged_matrix = np.zeros((len(Gn), len(Gn)))
for i1 in tqdm(range(len(Gn)), desc='computing GEDs', file=sys.stdout):
for i2 in range(len(Gn)):
dis, _, _ = GED(Gn[i1], Gn[i2], lib='gedlib')
ged_matrix[i1, i2] = dis
print(np.sort(ged_matrix[idx_gi[0] + 1]))
print(np.argsort(ged_matrix[idx_gi[0] + 1]))
print(np.sort(ged_matrix[idx_gi[1] + 1]))
print(np.argsort(ged_matrix[idx_gi[1] + 1]))
# for all g in Gn, compute (GED(g1, g) + GED(g2, g)) / 2
dis_median = [(ged_matrix[i, idx_gi[0] + 1] + ged_matrix[i, idx_gi[1] + 1]) / 2 for i in range(len(Gn))]
print(np.sort(dis_median))
print(np.argsort(dis_median))
return


def test_will_IAM_give_the_median_graph_we_wanted(Gn):
idx_gi = [0, 6]
g1 = Gn[idx_gi[0]].copy()
g2 = Gn[idx_gi[1]].copy()
# del Gn[idx_gi[0]]
# del Gn[idx_gi[1] - 1]
g_median = test_iam_with_more_graphs_as_init([g1, g2], [g1, g2], c_ei=1, c_er=1, c_es=1)
# g_median = test_iam_with_more_graphs_as_init(Gn, Gn, c_ei=1, c_er=1, c_es=1)
nx.draw_networkx(g_median)
plt.show()
print(g_median.nodes(data=True))
print(g_median.edges(data=True))
def test_new_IAM_allGraph_deleteNodes(Gn):
idx_gi = [0, 6]
# g1 = Gn[idx_gi[0]].copy()
# g2 = Gn[idx_gi[1]].copy()

g1 = nx.Graph(name='haha')
g1.add_nodes_from([(2, {'atom': 'C'}), (3, {'atom': 'O'}), (4, {'atom': 'C'})])
g1.add_edges_from([(2, 3, {'bond_type': '1'}), (3, 4, {'bond_type': '1'})])
g2 = nx.Graph(name='hahaha')
g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'}), (2, {'atom': 'C'}),
(3, {'atom': 'O'}), (4, {'atom': 'C'})])
g2.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'}),
(2, 3, {'bond_type': '1'}), (3, 4, {'bond_type': '1'})])
# g2 = g1.copy()
# g2.add_nodes_from([(3, {'atom': 'O'})])
# g2.add_nodes_from([(4, {'atom': 'C'})])
# g2.add_edges_from([(1, 3, {'bond_type': '1'})])
# g2.add_edges_from([(3, 4, {'bond_type': '1'})])

# del Gn[idx_gi[0]]
# del Gn[idx_gi[1] - 1]
g_median = test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations([g1, g2], [g1, g2], c_ei=1, c_er=1, c_es=1)
# g_median = test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations(Gn, Gn, c_ei=1, c_er=1, c_es=1)
nx.draw_networkx(g_median)
plt.show()
print(g_median.nodes(data=True))
print(g_median.edges(data=True))


if __name__ == '__main__':
from pygraph.utils.graphfiles import loadDataset
# ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}} # node/edge symb
# ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt',
# 'extra_params': {}} # node nsymb
# ds = {'name': 'Acyclic', 'dataset': '../datasets/monoterpenoides/trainset_9.ds',
# 'extra_params': {}}
ds = {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
'extra_params': {}} # node symb
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
# Gn = Gn[0:20]
test_new_IAM_allGraph_deleteNodes(Gn)
test_will_IAM_give_the_median_graph_we_wanted(Gn)
test_who_is_the_closest_in_GED_space(Gn)
test_who_is_the_closest_in_kernel_space(Gn)
lmbda = 0.03 # termination probalility
r_max = 10 # recursions
l = 500
alpha_range = np.linspace(0.5, 0.5, 1)
k = 20 # k nearest neighbors
# randomly select two molecules
np.random.seed(1)
idx_gi = [0, 6] # np.random.randint(0, len(Gn), 2)
g1 = Gn[idx_gi[0]]
g2 = Gn[idx_gi[1]]
# g_tmp = iam([g1, g2])
# nx.draw_networkx(g_tmp)
# plt.show()
# compute
# k_list = [] # kernel between each graph and itself.
# k_g1_list = [] # kernel between each graph and g1
# k_g2_list = [] # kernel between each graph and g2
# for ig, g in tqdm(enumerate(Gn), desc='computing self kernels', file=sys.stdout):
# ktemp = compute_kernel([g, g1, g2], 'marginalizedkernel', False)
# k_list.append(ktemp[0][0, 0])
# k_g1_list.append(ktemp[0][0, 1])
# k_g2_list.append(ktemp[0][0, 2])
km = compute_kernel(Gn, 'untilhpathkernel', True)
# k_list = np.diag(km) # kernel between each graph and itself.
# k_g1_list = km[idx_gi[0]] # kernel between each graph and g1
# k_g2_list = km[idx_gi[1]] # kernel between each graph and g2

g_best = []
dis_best = []
# for each alpha
for alpha in alpha_range:
print('alpha =', alpha)
dhat, ghat = gk_iam_nearest(Gn, [alpha, 1 - alpha], idx_gi, km, k, r_max)
dis_best.append(dhat)
g_best.append(ghat)
for idx, item in enumerate(alpha_range):
print('when alpha is', item, 'the shortest distance is', dis_best[idx])
print('the corresponding pre-image is')
nx.draw_networkx(g_best[idx])
plt.show()
return dmatrix

+ 39
- 28
preimage/iam.py View File

@@ -158,7 +158,7 @@ def GED(g1, g2, lib='gedlib'):
script.PyRestartEnv() script.PyRestartEnv()
script.PyLoadGXLGraph('ged_tmp/', 'ged_tmp/tmp.xml') script.PyLoadGXLGraph('ged_tmp/', 'ged_tmp/tmp.xml')
listID = script.PyGetGraphIds() listID = script.PyGetGraphIds()
script.PySetEditCost("CHEM_1")
script.PySetEditCost("LETTER") #("CHEM_1")
script.PyInitEnv() script.PyInitEnv()
script.PySetMethod("IPFP", "") script.PySetMethod("IPFP", "")
script.PyInitMethod() script.PyInitMethod()
@@ -168,7 +168,15 @@ def GED(g1, g2, lib='gedlib'):
pi_forward, pi_backward = script.PyGetAllMap(g, h) pi_forward, pi_backward = script.PyGetAllMap(g, h)
upper = script.PyGetUpperBound(g, h) upper = script.PyGetUpperBound(g, h)
lower = script.PyGetLowerBound(g, h) lower = script.PyGetLowerBound(g, h)
dis = (upper + lower) / 2
dis = upper
# correct the mapping: label removed nodes as np.inf
nodes1 = [n for n in g1.nodes()]
nodes2 = [n for n in g2.nodes()]
nb1 = nx.number_of_nodes(g1)
nb2 = nx.number_of_nodes(g2)
pi_forward = [nodes2[pi] if pi < nb2 else np.inf for pi in pi_forward]
pi_backward = [nodes1[pi] if pi < nb1 else np.inf for pi in pi_backward]
return dis, pi_forward, pi_backward return dis, pi_forward, pi_backward
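# A small illustration (assumed values, not repo output) of the relabeling
# above: gedlib returns column indices; any index >= the other graph's node
# count encodes a node removal, which is mapped to np.inf here.
import numpy as np
nodes2 = [5, 7]             # hypothetical node ids of g2
nb2 = len(nodes2)
pi_forward_raw = [1, 2, 2]  # hypothetical raw assignment for g1's 3 nodes
pi_forward = [nodes2[pi] if pi < nb2 else np.inf for pi in pi_forward_raw]
print(pi_forward)           # [7, inf, inf]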


@@ -319,7 +327,7 @@ def test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations(
from tqdm import tqdm from tqdm import tqdm
# Gn_median = Gn_median[0:10] # Gn_median = Gn_median[0:10]
# Gn_median = [nx.convert_node_labels_to_integers(g) for g in Gn_median] # Gn_median = [nx.convert_node_labels_to_integers(g) for g in Gn_median]
node_ir = sys.maxsize * 2 # Max number for c++, corresponding to the node remove and insertion.
node_ir = np.inf # corresponds to node removal and insertion.
label_r = 'thanksdanny' # the label for node remove. # @todo: make this label unrepeatable. label_r = 'thanksdanny' # the label for node remove. # @todo: make this label unrepeatable.
ds_attrs = get_dataset_attributes(Gn_median + Gn_candidate, ds_attrs = get_dataset_attributes(Gn_median + Gn_candidate,
attr_names=['edge_labeled', 'node_attr_dim'], attr_names=['edge_labeled', 'node_attr_dim'],
@@ -347,7 +355,7 @@ def test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations(
h_i0 = 0 h_i0 = 0
for idx, g in enumerate(Gn_median): for idx, g in enumerate(Gn_median):
pi_i = pi_p_forward[idx][ndi] pi_i = pi_p_forward[idx][ndi]
if g.has_node(pi_i) and g.nodes[pi_i][node_label] == label:
if pi_i != node_ir and g.nodes[pi_i][node_label] == label:
h_i0 += 1 h_i0 += 1
h_i0_list.append(h_i0) h_i0_list.append(h_i0)
label_list.append(label) label_list.append(label)
@@ -364,7 +372,7 @@ def test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations(
nlabel_best = [label_list[idx] for idx in idx_max] nlabel_best = [label_list[idx] for idx in idx_max]
# generate "best" graphs with regard to "best" node labels. # generate "best" graphs with regard to "best" node labels.
G_new_list_nd = [] G_new_list_nd = []
for g in G_new_list:
for g in G_new_list: # @todo: seems it can be simplified. The G_new_list will only contain 1 graph for now.
for nl in nlabel_best: for nl in nlabel_best:
g_tmp = g.copy() g_tmp = g.copy()
if nl == label_r: if nl == label_r:
@@ -380,16 +388,16 @@ def test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations(
G_new_list = G_new_list_nd[:] G_new_list = G_new_list_nd[:]


else: # labels are non-symbolic else: # labels are non-symbolic
for nd in G.nodes():
for ndi, (nd, _) in enumerate(G.nodes(data=True)):
Si_norm = 0 Si_norm = 0
phi_i_bar = np.array([0.0 for _ in range(ds_attrs['node_attr_dim'])]) phi_i_bar = np.array([0.0 for _ in range(ds_attrs['node_attr_dim'])])
for idx, g in enumerate(Gn_median): for idx, g in enumerate(Gn_median):
pi_i = pi_p_forward[idx][nd]
pi_i = pi_p_forward[idx][ndi]
if g.has_node(pi_i): #@todo: what if no g has node? phi_i_bar = 0? if g.has_node(pi_i): #@todo: what if no g has node? phi_i_bar = 0?
Si_norm += 1 Si_norm += 1
phi_i_bar += np.array([float(itm) for itm in g.nodes[pi_i]['attributes']]) phi_i_bar += np.array([float(itm) for itm in g.nodes[pi_i]['attributes']])
phi_i_bar /= Si_norm phi_i_bar /= Si_norm
G_new.nodes[nd]['attributes'] = phi_i_bar
G_new_list[0].nodes[nd]['attributes'] = phi_i_bar
# update edge labels and adjacency matrix. # update edge labels and adjacency matrix.
if ds_attrs['edge_labeled']: if ds_attrs['edge_labeled']:
@@ -467,12 +475,12 @@ def test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations(
# pi_forward_list = [pi_forward_list[idx] for idx in idx_min_list] # pi_forward_list = [pi_forward_list[idx] for idx in idx_min_list]
# G_new_list = [G_new_list[idx] for idx in idx_min_list] # G_new_list = [G_new_list[idx] for idx in idx_min_list]
for g in G_new_list:
import matplotlib.pyplot as plt
nx.draw_networkx(g)
plt.show()
print(g.nodes(data=True))
print(g.edges(data=True))
# for g in G_new_list:
# import matplotlib.pyplot as plt
# nx.draw_networkx(g)
# plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
return G_new_list, pi_forward_list return G_new_list, pi_forward_list
@@ -504,7 +512,7 @@ def test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations(
G_list = [G] G_list = [G]
pi_forward_list = [pi_p_forward] pi_forward_list = [pi_p_forward]
# iterations. # iterations.
for itr in range(0, 10): # @todo: the convergence condition?
for itr in range(0, 5): # @todo: the convergence condition?
# print('itr is', itr) # print('itr is', itr)
G_new_list = [] G_new_list = []
pi_forward_new_list = [] pi_forward_new_list = []
@@ -562,7 +570,7 @@ def test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations(
# phase 1: initialize. # phase 1: initialize.
# compute set-median. # compute set-median.
dis_min = np.inf dis_min = np.inf
dis_all, pi_all_forward = median_distance(Gn_candidate[::-1], Gn_median)
dis_all, pi_all_forward = median_distance(Gn_candidate, Gn_median)
# find all smallest distances. # find all smallest distances.
idx_min_list = np.argwhere(dis_all == np.min(dis_all)).flatten().tolist() idx_min_list = np.argwhere(dis_all == np.min(dis_all)).flatten().tolist()
dis_min = dis_all[idx_min_list[0]] dis_min = dis_all[idx_min_list[0]]
@@ -580,24 +588,27 @@ def test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations(
G_list, _ = remove_duplicates(G_list) G_list, _ = remove_duplicates(G_list)
if connected == True: if connected == True:
G_list, _ = remove_disconnected(G_list)
G_list_con, _ = remove_disconnected(G_list)
# if there are no connected graphs at all, keep the disconnected ones.
if len(G_list_con) > 0: # @todo: ??????????????????????????
G_list = G_list_con


import matplotlib.pyplot as plt
for g in G_list:
nx.draw_networkx(g)
plt.show()
print(g.nodes(data=True))
print(g.edges(data=True))
# import matplotlib.pyplot as plt
# for g in G_list:
# nx.draw_networkx(g)
# plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
# get the best median graphs # get the best median graphs
dis_all, pi_all_forward = median_distance(G_list, Gn_median) dis_all, pi_all_forward = median_distance(G_list, Gn_median)
G_min_list, pi_forward_min_list, dis_min = best_median_graphs( G_min_list, pi_forward_min_list, dis_min = best_median_graphs(
G_list, dis_all, pi_all_forward) G_list, dis_all, pi_all_forward)
for g in G_min_list:
nx.draw_networkx(g)
plt.show()
print(g.nodes(data=True))
print(g.edges(data=True))
# for g in G_min_list:
# nx.draw_networkx(g)
# plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
return G_min_list return G_min_list






+ 181
- 117
preimage/preimage.py View File

@@ -9,6 +9,7 @@ pre-image


import sys import sys
import numpy as np import numpy as np
import random
import multiprocessing import multiprocessing
from tqdm import tqdm from tqdm import tqdm
import networkx as nx import networkx as nx
@@ -16,127 +17,190 @@ import matplotlib.pyplot as plt




sys.path.insert(0, "../") sys.path.insert(0, "../")
from pygraph.kernels.marginalizedKernel import marginalizedkernel
from pygraph.utils.graphfiles import loadDataset from pygraph.utils.graphfiles import loadDataset
from pygraph.kernels.marginalizedKernel import marginalizedkernel
from pygraph.kernels.untilHPathKernel import untilhpathkernel
from pygraph.kernels.spKernel import spkernel
import functools
from pygraph.utils.kernels import deltakernel, gaussiankernel, kernelproduct
from pygraph.kernels.structuralspKernel import structuralspkernel




ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}} # node/edge symb

DN, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
DN = DN[0:10]

lmbda = 0.03 # termination probalility
r_max = 10 # recursions
l = 500
alpha_range = np.linspace(0.1, 0.9, 9)
k = 5 # k nearest neighbors

# randomly select two molecules
np.random.seed(1)
idx1, idx2 = np.random.randint(0, len(DN), 2)
g1 = DN[idx1]
g2 = DN[idx2]
def compute_kernel(Gn, graph_kernel, verbose):
if graph_kernel == 'marginalizedkernel':
Kmatrix, _ = marginalizedkernel(Gn, node_label='atom', edge_label=None,
p_quit=0.03, n_iteration=20, remove_totters=False,
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
elif graph_kernel == 'untilhpathkernel':
Kmatrix, _ = untilhpathkernel(Gn, node_label='atom', edge_label='bond_type',
depth=10, k_func='MinMax', compute_method='trie',
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
elif graph_kernel == 'spkernel':
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
Kmatrix, _, _ = spkernel(Gn, node_label='atom', node_kernels=
{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel},
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
elif graph_kernel == 'structuralspkernel':
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
Kmatrix, _ = structuralspkernel(Gn, node_label='atom', node_kernels=
{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel},
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
# normalization
# Kmatrix_diag = Kmatrix.diagonal().copy()
# for i in range(len(Kmatrix)):
# for j in range(i, len(Kmatrix)):
# Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
# Kmatrix[j][i] = Kmatrix[i][j]
return Kmatrix


# compute
k_list = [] # kernel between each graph and itself.
k_g1_list = [] # kernel between each graph and g1
k_g2_list = [] # kernel between each graph and g2
for ig, g in tqdm(enumerate(DN), desc='computing self kernels', file=sys.stdout):
ktemp = marginalizedkernel([g, g1, g2], node_label='atom', edge_label=None,
p_quit=lmbda, n_iteration=20, remove_totters=False,
n_jobs=multiprocessing.cpu_count(), verbose=False)
k_list.append(ktemp[0][0, 0])
k_g1_list.append(ktemp[0][0, 1])
k_g2_list.append(ktemp[0][0, 2])


g_best = []
dis_best = []
# for each alpha
for alpha in alpha_range:
print('alpha =', alpha)
# compute k nearest neighbors of phi in DN.
dis_list = [] # distance between g_star and each graph.
for ig, g in tqdm(enumerate(DN), desc='computing distances', file=sys.stdout):
dtemp = k_list[ig] - 2 * (alpha * k_g1_list[ig] + (1 - alpha) *
k_g2_list[ig]) + (alpha * alpha * k_list[idx1] + alpha *
(1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha *
k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2])
dis_list.append(dtemp)
if __name__ == '__main__':
# sort
sort_idx = np.argsort(dis_list)
dis_gs = [dis_list[idis] for idis in sort_idx[0:k]]
g0hat = DN[sort_idx[0]] # the nearest neighbor of phi in DN
if dis_gs[0] == 0: # the exact pre-image.
print('The exact pre-image is found from the input dataset.')
g_pimg = g0hat
break
dhat = dis_gs[0] # the nearest distance
Dk = [DN[ig] for ig in sort_idx[0:k]] # the k nearest neighbors
gihat_list = []
# ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
# 'extra_params': {}} # node/edge symb
# ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt',
# 'extra_params': {}} # node nsymb
# ds = {'name': 'Acyclic', 'dataset': '../datasets/monoterpenoides/trainset_9.ds',
# 'extra_params': {}}
ds = {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
'extra_params': {}} # node symb
i = 1
r = 1
while r < r_max:
print('r =', r)
found = False
for ig, gs in enumerate(Dk + gihat_list):
# nx.draw_networkx(gs)
# plt.show()
fdgs = int(np.abs(np.ceil(np.log(alpha * dis_gs[ig])))) # @todo ???
for trail in tqdm(range(0, l), desc='l loop', file=sys.stdout):
# add and delete edges.
gtemp = gs.copy()
np.random.seed()
# which edges to change.
idx_change = np.random.randint(0, nx.number_of_nodes(gs) *
(nx.number_of_nodes(gs) - 1), fdgs)
for item in idx_change:
node1 = int(item / (nx.number_of_nodes(gs) - 1))
node2 = (item - node1 * (nx.number_of_nodes(gs) - 1))
if node2 >= node1:
node2 += 1
# @todo: is the randomness correct?
if not gtemp.has_edge(node1, node2):
gtemp.add_edges_from([(node1, node2, {'bond_type': 0})])
# nx.draw_networkx(gs)
# plt.show()
# nx.draw_networkx(gtemp)
# plt.show()
else:
gtemp.remove_edge(node1, node2)
# nx.draw_networkx(gs)
# plt.show()
# nx.draw_networkx(gtemp)
# plt.show()
# nx.draw_networkx(gtemp)
# plt.show()
# compute distance between phi and the new generated graph.
knew = marginalizedkernel([gtemp, g1, g2], node_label='atom', edge_label=None,
p_quit=lmbda, n_iteration=20, remove_totters=False,
n_jobs=multiprocessing.cpu_count(), verbose=False)
dnew = knew[0][0, 0] - 2 * (alpha * knew[0][0, 1] + (1 - alpha) *
knew[0][0, 2]) + (alpha * alpha * k_list[idx1] + alpha *
(1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha *
k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2])
if dnew <= dhat: # the new distance is smaller
print('I am smaller!')
dhat = dnew
gnew = gtemp.copy()
found = True # found better graph.
r = 0
if found:
gihat_list = [gnew]
dis_gs.append(dhat)
else:
r += 1
dis_best.append(dhat)
g_best += ([g0hat] if len(gihat_list) == 0 else gihat_list)

for idx, item in enumerate(alpha_range):
print('when alpha is', item, 'the shortest distance is', dis_best[idx])
print('the corresponding pre-image is')
nx.draw_networkx(g_best[idx])
plt.show()
DN, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
#DN = DN[0:10]
lmbda = 0.03 # termination probability
r_max = 10 # recursions
l = 500
alpha_range = np.linspace(0.5, 0.5, 1)
#alpha_range = np.linspace(0.1, 0.9, 9)
k = 5 # k nearest neighbors
# randomly select two molecules
#np.random.seed(1)
#idx1, idx2 = np.random.randint(0, len(DN), 2)
#g1 = DN[idx1]
#g2 = DN[idx2]
idx1 = 0
idx2 = 6
g1 = DN[idx1]
g2 = DN[idx2]
# compute
k_list = [] # kernel between each graph and itself.
k_g1_list = [] # kernel between each graph and g1
k_g2_list = [] # kernel between each graph and g2
for ig, g in tqdm(enumerate(DN), desc='computing self kernels', file=sys.stdout):
# ktemp = marginalizedkernel([g, g1, g2], node_label='atom', edge_label=None,
# p_quit=lmbda, n_iteration=20, remove_totters=False,
# n_jobs=multiprocessing.cpu_count(), verbose=False)
ktemp = compute_kernel([g, g1, g2], 'untilhpathkernel', verbose=False)
k_list.append(ktemp[0, 0])
k_g1_list.append(ktemp[0, 1])
k_g2_list.append(ktemp[0, 2])
g_best = []
dis_best = []
# for each alpha
for alpha in alpha_range:
print('alpha =', alpha)
# compute k nearest neighbors of phi in DN.
dis_list = [] # distance between g_star and each graph.
for ig, g in tqdm(enumerate(DN), desc='computing distances', file=sys.stdout):
dtemp = k_list[ig] - 2 * (alpha * k_g1_list[ig] + (1 - alpha) *
k_g2_list[ig]) + (alpha * alpha * k_list[idx1] + alpha *
(1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha *
k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2])
dis_list.append(np.sqrt(dtemp))
# sort
sort_idx = np.argsort(dis_list)
dis_gs = [dis_list[idis] for idis in sort_idx[0:k]]
g0hat = DN[sort_idx[0]] # the nearest neighbor of phi in DN
if dis_gs[0] == 0: # the exact pre-image.
print('The exact pre-image is found from the input dataset.')
g_pimg = g0hat
break
dhat = dis_gs[0] # the nearest distance
Dk = [DN[ig] for ig in sort_idx[0:k]] # the k nearest neighbors
gihat_list = []
i = 1
r = 1
while r < r_max:
print('r =', r)
found = False
for ig, gs in enumerate(Dk + gihat_list):
# nx.draw_networkx(gs)
# plt.show()
# @todo: what if the log is negative?
fdgs = int(np.abs(np.ceil(np.log(alpha * dis_gs[ig]))))
for trail in tqdm(range(0, l), desc='l loop', file=sys.stdout):
# add and delete edges.
gtemp = gs.copy()
np.random.seed()
# which edges to change.
# @todo: should we use just half of the adjacency matrix for undirected graphs?
nb_vpairs = nx.number_of_nodes(gs) * (nx.number_of_nodes(gs) - 1)
# @todo: what if fdgs is bigger than nb_vpairs?
idx_change = random.sample(range(nb_vpairs), fdgs if fdgs < nb_vpairs else nb_vpairs)
# idx_change = np.random.randint(0, nx.number_of_nodes(gs) *
# (nx.number_of_nodes(gs) - 1), fdgs)
for item in idx_change:
node1 = int(item / (nx.number_of_nodes(gs) - 1))
node2 = (item - node1 * (nx.number_of_nodes(gs) - 1))
if node2 >= node1: # skip the self pair.
node2 += 1
# @todo: is the randomness correct?
if not gtemp.has_edge(node1, node2):
# @todo: how to update the bond_type? 0 or 1?
gtemp.add_edges_from([(node1, node2, {'bond_type': 1})])
# nx.draw_networkx(gs)
# plt.show()
# nx.draw_networkx(gtemp)
# plt.show()
else:
gtemp.remove_edge(node1, node2)
# nx.draw_networkx(gs)
# plt.show()
# nx.draw_networkx(gtemp)
# plt.show()
# nx.draw_networkx(gtemp)
# plt.show()
# compute distance between phi and the new generated graph.
# knew = marginalizedkernel([gtemp, g1, g2], node_label='atom', edge_label=None,
# p_quit=lmbda, n_iteration=20, remove_totters=False,
# n_jobs=multiprocessing.cpu_count(), verbose=False)
knew = compute_kernel([gtemp, g1, g2], 'untilhpathkernel', verbose=False)
dnew = np.sqrt(knew[0, 0] - 2 * (alpha * knew[0, 1] + (1 - alpha) *
knew[0, 2]) + (alpha * alpha * k_list[idx1] + alpha *
(1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha *
k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2]))
if dnew < dhat: # @todo: the new distance is smaller or also equal?
print('I am smaller!')
print(dhat, '->', dnew)
nx.draw_networkx(gtemp)
plt.show()
print(gtemp.nodes(data=True))
print(gtemp.edges(data=True))
dhat = dnew
gnew = gtemp.copy()
found = True # found better graph.
r = 0
elif dnew == dhat:
print('I am equal!')
if found:
gihat_list = [gnew]
dis_gs.append(dhat)
else:
r += 1
dis_best.append(dhat)
g_best += ([g0hat] if len(gihat_list) == 0 else gihat_list)
for idx, item in enumerate(alpha_range):
print('when alpha is', item, 'the shortest distance is', dis_best[idx])
print('the corresponding pre-image is')
nx.draw_networkx(g_best[idx])
plt.show()
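# A sketch (illustration only) of the index -> node-pair decoding used in the
# rewiring loop above: each integer in [0, n*(n-1)) encodes an ordered pair
# of distinct nodes; node2 is shifted past node1 to skip the self pair.
n = 4
pairs = []
for item in range(n * (n - 1)):
    node1 = item // (n - 1)
    node2 = item - node1 * (n - 1)
    if node2 >= node1:
        node2 += 1
    pairs.append((node1, node2))
print(pairs)  # all 12 ordered pairs (i, j) with i != j, none of them (i, i)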

+ 111
- 72
pygraph/kernels/treeletKernel.py View File

@@ -24,6 +24,7 @@ def treeletkernel(*args,
sub_kernel, sub_kernel,
node_label='atom', node_label='atom',
edge_label='bond_type', edge_label='bond_type',
parallel='imap_unordered',
n_jobs=None, n_jobs=None,
verbose=True): verbose=True):
"""Calculate treelet graph kernels between graphs. """Calculate treelet graph kernels between graphs.
@@ -70,34 +71,55 @@ def treeletkernel(*args,
start_time = time.time() start_time = time.time()
# ---- use pool.imap_unordered to parallel and track progress. ---- # ---- use pool.imap_unordered to parallel and track progress. ----
# get all canonical keys of all graphs before calculating kernels to save
# time, but this may cost a lot of memory for large dataset.
pool = Pool(n_jobs)
itr = zip(Gn, range(0, len(Gn)))
if len(Gn) < 100 * n_jobs:
chunksize = int(len(Gn) / n_jobs) + 1
else:
chunksize = 100
canonkeys = [[] for _ in range(len(Gn))]
get_partial = partial(wrapper_get_canonkeys, node_label, edge_label,
labeled, ds_attrs['is_directed'])
if verbose:
iterator = tqdm(pool.imap_unordered(get_partial, itr, chunksize),
desc='getting canonkeys', file=sys.stdout)
if parallel == 'imap_unordered':
# get all canonical keys of all graphs before calculating kernels to save
# time, but this may cost a lot of memory for large datasets.
pool = Pool(n_jobs)
itr = zip(Gn, range(0, len(Gn)))
if len(Gn) < 100 * n_jobs:
chunksize = int(len(Gn) / n_jobs) + 1
else:
chunksize = 100
canonkeys = [[] for _ in range(len(Gn))]
get_partial = partial(wrapper_get_canonkeys, node_label, edge_label,
labeled, ds_attrs['is_directed'])
if verbose:
iterator = tqdm(pool.imap_unordered(get_partial, itr, chunksize),
desc='getting canonkeys', file=sys.stdout)
else:
iterator = pool.imap_unordered(get_partial, itr, chunksize)
for i, ck in iterator:
canonkeys[i] = ck
pool.close()
pool.join()
# compute kernels.
def init_worker(canonkeys_toshare):
global G_canonkeys
G_canonkeys = canonkeys_toshare
do_partial = partial(wrapper_treeletkernel_do, sub_kernel)
parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
glbv=(canonkeys,), n_jobs=n_jobs, verbose=verbose)
# ---- do not use parallelization. ----
elif parallel is None:
# get all canonical keys of all graphs before calculating kernels to save
# time, but this may cost a lot of memory for large datasets.
canonkeys = []
for g in (tqdm(Gn, desc='getting canonkeys', file=sys.stdout) if verbose else Gn):
canonkeys.append(get_canonkeys(g, node_label, edge_label, labeled,
ds_attrs['is_directed']))
# compute kernels.
from itertools import combinations_with_replacement
itr = combinations_with_replacement(range(0, len(Gn)), 2)
for i, j in (tqdm(itr, desc='calculating kernels', file=sys.stdout) if verbose else itr):
Kmatrix[i][j] = _treeletkernel_do(canonkeys[i], canonkeys[j], sub_kernel)
Kmatrix[j][i] = Kmatrix[i][j] # @todo: no directed graph considered?
else: else:
iterator = pool.imap_unordered(get_partial, itr, chunksize)
for i, ck in iterator:
canonkeys[i] = ck
pool.close()
pool.join()
# compute kernels.
def init_worker(canonkeys_toshare):
global G_canonkeys
G_canonkeys = canonkeys_toshare
do_partial = partial(wrapper_treeletkernel_do, sub_kernel)
parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
glbv=(canonkeys,), n_jobs=n_jobs, verbose=verbose)
raise Exception('No proper parallelization method designated.')

run_time = time.time() - start_time run_time = time.time() - start_time
if verbose: if verbose:
@@ -123,8 +145,7 @@ def _treeletkernel_do(canonkey1, canonkey2, sub_kernel):
keys = set(canonkey1.keys()) & set(canonkey2.keys()) # find same canonical keys in both graphs keys = set(canonkey1.keys()) & set(canonkey2.keys()) # find same canonical keys in both graphs
vector1 = np.array([(canonkey1[key] if (key in canonkey1.keys()) else 0) for key in keys]) vector1 = np.array([(canonkey1[key] if (key in canonkey1.keys()) else 0) for key in keys])
vector2 = np.array([(canonkey2[key] if (key in canonkey2.keys()) else 0) for key in keys]) vector2 = np.array([(canonkey2[key] if (key in canonkey2.keys()) else 0) for key in keys])
kernel = np.sum(np.exp(-np.square(vector1 - vector2) / 2))
# kernel = sub_kernel(vector1, vector2)
kernel = sub_kernel(vector1, vector2)
return kernel return kernel
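# A hypothetical usage sketch of the new `parallel` switch (the dataset path
# and the Gaussian sub-kernel below are assumptions for illustration, and the
# (Kmatrix, run_time) return shape mirrors the other kernels in this repo):
import multiprocessing
import numpy as np
from pygraph.utils.graphfiles import loadDataset
from pygraph.kernels.treeletKernel import treeletkernel

def gaussian_sub_kernel(v1, v2):
    return np.sum(np.exp(-np.square(v1 - v2) / 2))

Gn, y = loadDataset('../datasets/MUTAG/MUTAG_A.txt')
# multi-process computation with pool.imap_unordered:
Kmatrix, run_time = treeletkernel(Gn, sub_kernel=gaussian_sub_kernel,
                                  parallel='imap_unordered',
                                  n_jobs=multiprocessing.cpu_count())
# single-process fallback, handy for debugging:
Kmatrix, run_time = treeletkernel(Gn, sub_kernel=gaussian_sub_kernel,
                                  parallel=None)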




@@ -266,7 +287,7 @@ def get_canonkeys(G, node_label, edge_label, labeled, is_directed):
# linear patterns # linear patterns
canonkey_t = Counter(list(nx.get_node_attributes(G, node_label).values())) canonkey_t = Counter(list(nx.get_node_attributes(G, node_label).values()))
for key in canonkey_t: for key in canonkey_t:
canonkey_l['0' + key] = canonkey_t[key]
canonkey_l[('0', key)] = canonkey_t[key]


for i in range(1, 6): # for i in range(1, 6): for i in range(1, 6): # for i in range(1, 6):
treelet = [] treelet = []
@@ -274,93 +295,111 @@ def get_canonkeys(G, node_label, edge_label, labeled, is_directed):
canonlist = list(chain.from_iterable((G.node[node][node_label], \ canonlist = list(chain.from_iterable((G.node[node][node_label], \
G[node][pattern[idx+1]][edge_label]) for idx, node in enumerate(pattern[:-1]))) G[node][pattern[idx+1]][edge_label]) for idx, node in enumerate(pattern[:-1])))
canonlist.append(G.node[pattern[-1]][node_label]) canonlist.append(G.node[pattern[-1]][node_label])
canonkey_t = ''.join(canonlist)
canonkey_t = canonkey_t if canonkey_t < canonkey_t[::-1] else canonkey_t[::-1]
treelet.append(str(i) + canonkey_t)
canonkey_t = canonlist if canonlist < canonlist[::-1] else canonlist[::-1]
treelet.append(tuple([str(i)] + canonkey_t))
canonkey_l.update(Counter(treelet)) canonkey_l.update(Counter(treelet))


# n-star patterns # n-star patterns
for i in range(3, 6): for i in range(3, 6):
treelet = [] treelet = []
for pattern in patterns[str(i) + 'star']: for pattern in patterns[str(i) + 'star']:
canonlist = [ G.node[leaf][node_label] + G[leaf][pattern[0]][edge_label] for leaf in pattern[1:] ]
canonlist = [tuple((G.node[leaf][node_label],
G[leaf][pattern[0]][edge_label])) for leaf in pattern[1:]]
canonlist.sort() canonlist.sort()
canonkey_t = ('d' if i == 5 else str(i * 2)) + G.node[pattern[0]][node_label] + ''.join(canonlist)
canonlist = list(chain.from_iterable(canonlist))
canonkey_t = tuple(['d' if i == 5 else str(i * 2)] +
[G.node[pattern[0]][node_label]] + canonlist)
treelet.append(canonkey_t) treelet.append(canonkey_t)
canonkey_l.update(Counter(treelet)) canonkey_l.update(Counter(treelet))


# pattern 7 # pattern 7
treelet = [] treelet = []
for pattern in patterns['7']: for pattern in patterns['7']:
canonlist = [ G.node[leaf][node_label] + G[leaf][pattern[0]][edge_label] for leaf in pattern[1:3] ]
canonlist = [tuple((G.node[leaf][node_label],
G[leaf][pattern[0]][edge_label])) for leaf in pattern[1:3]]
canonlist.sort() canonlist.sort()
canonkey_t = '7' + G.node[pattern[0]][node_label] + ''.join(canonlist) \
+ G.node[pattern[3]][node_label] + G[pattern[3]][pattern[0]][edge_label] \
+ G.node[pattern[4]][node_label] + G[pattern[4]][pattern[3]][edge_label]
canonlist = list(chain.from_iterable(canonlist))
canonkey_t = tuple(['7'] + [G.node[pattern[0]][node_label]] + canonlist
+ [G.node[pattern[3]][node_label]]
+ [G[pattern[3]][pattern[0]][edge_label]]
+ [G.node[pattern[4]][node_label]]
+ [G[pattern[4]][pattern[3]][edge_label]])
treelet.append(canonkey_t) treelet.append(canonkey_t)
canonkey_l.update(Counter(treelet)) canonkey_l.update(Counter(treelet))


# pattern 11 # pattern 11
treelet = [] treelet = []
for pattern in patterns['11']: for pattern in patterns['11']:
canonlist = [ G.node[leaf][node_label] + G[leaf][pattern[0]][edge_label] for leaf in pattern[1:4] ]
canonlist = [tuple((G.node[leaf][node_label],
G[leaf][pattern[0]][edge_label])) for leaf in pattern[1:4]]
canonlist.sort() canonlist.sort()
canonkey_t = 'b' + G.node[pattern[0]][node_label] + ''.join(canonlist) \
+ G.node[pattern[4]][node_label] + G[pattern[4]][pattern[0]][edge_label] \
+ G.node[pattern[5]][node_label] + G[pattern[5]][pattern[4]][edge_label]
canonlist = list(chain.from_iterable(canonlist))
canonkey_t = tuple(['b'] + [G.node[pattern[0]][node_label]] + canonlist
+ [G.node[pattern[4]][node_label]]
+ [G[pattern[4]][pattern[0]][edge_label]]
+ [G.node[pattern[5]][node_label]]
+ [G[pattern[5]][pattern[4]][edge_label]])
treelet.append(canonkey_t) treelet.append(canonkey_t)
canonkey_l.update(Counter(treelet)) canonkey_l.update(Counter(treelet))


# pattern 10 # pattern 10
treelet = [] treelet = []
for pattern in patterns['10']: for pattern in patterns['10']:
canonkey4 = G.node[pattern[5]][node_label] + G[pattern[5]][pattern[4]][edge_label]
canonlist = [ G.node[leaf][node_label] + G[leaf][pattern[0]][edge_label] for leaf in pattern[1:3] ]
canonkey4 = [G.node[pattern[5]][node_label], G[pattern[5]][pattern[4]][edge_label]]
canonlist = [tuple((G.node[leaf][node_label],
G[leaf][pattern[0]][edge_label])) for leaf in pattern[1:3]]
canonlist.sort() canonlist.sort()
canonkey0 = ''.join(canonlist)
canonkey_t = 'a' + G.node[pattern[3]][node_label] \
+ G.node[pattern[4]][node_label] + G[pattern[4]][pattern[3]][edge_label] \
+ G.node[pattern[0]][node_label] + G[pattern[0]][pattern[3]][edge_label] \
+ canonkey4 + canonkey0
canonkey0 = list(chain.from_iterable(canonlist))
canonkey_t = tuple(['a'] + [G.node[pattern[3]][node_label]]
+ [G.node[pattern[4]][node_label]]
+ [G[pattern[4]][pattern[3]][edge_label]]
+ [G.node[pattern[0]][node_label]]
+ [G[pattern[0]][pattern[3]][edge_label]]
+ canonkey4 + canonkey0)
treelet.append(canonkey_t) treelet.append(canonkey_t)
canonkey_l.update(Counter(treelet)) canonkey_l.update(Counter(treelet))


# pattern 12 # pattern 12
treelet = [] treelet = []
for pattern in patterns['12']: for pattern in patterns['12']:
canonlist0 = [ G.node[leaf][node_label] + G[leaf][pattern[0]][edge_label] for leaf in pattern[1:3] ]
canonlist0 = [tuple((G.node[leaf][node_label],
G[leaf][pattern[0]][edge_label])) for leaf in pattern[1:3]]
canonlist0.sort() canonlist0.sort()
canonlist3 = [ G.node[leaf][node_label] + G[leaf][pattern[3]][edge_label] for leaf in pattern[4:6] ]
canonlist0 = list(chain.from_iterable(canonlist0))
canonlist3 = [tuple((G.node[leaf][node_label],
G[leaf][pattern[3]][edge_label])) for leaf in pattern[4:6]]
canonlist3.sort() canonlist3.sort()
canonlist3 = list(chain.from_iterable(canonlist3))
# 2 possible key can be generated from 2 nodes with extended label 3, select the one with lower lexicographic order.
canonkey_t1 = 'c' + G.node[pattern[0]][node_label] \
+ ''.join(canonlist0) \
+ G.node[pattern[3]][node_label] + G[pattern[3]][pattern[0]][edge_label] \
+ ''.join(canonlist3)

canonkey_t2 = 'c' + G.node[pattern[3]][node_label] \
+ ''.join(canonlist3) \
+ G.node[pattern[0]][node_label] + G[pattern[0]][pattern[3]][edge_label] \
+ ''.join(canonlist0)

# 2 possible keys can be generated from the 2 nodes with extended label 3;
# select the one with the lower lexicographic order.
canonkey_t1 = tuple(['c'] + [G.node[pattern[0]][node_label]] + canonlist0
+ [G.node[pattern[3]][node_label]]
+ [G[pattern[3]][pattern[0]][edge_label]]
+ canonlist3)
canonkey_t2 = tuple(['c'] + [G.node[pattern[3]][node_label]] + canonlist3
+ [G.node[pattern[0]][node_label]]
+ [G[pattern[0]][pattern[3]][edge_label]]
+ canonlist0)
treelet.append(canonkey_t1 if canonkey_t1 < canonkey_t2 else canonkey_t2) treelet.append(canonkey_t1 if canonkey_t1 < canonkey_t2 else canonkey_t2)
canonkey_l.update(Counter(treelet)) canonkey_l.update(Counter(treelet))


# pattern 9 # pattern 9
treelet = [] treelet = []
for pattern in patterns['9']: for pattern in patterns['9']:
canonkey2 = G.node[pattern[4]][node_label] + G[pattern[4]][pattern[2]][edge_label]
canonkey3 = G.node[pattern[5]][node_label] + G[pattern[5]][pattern[3]][edge_label]
prekey2 = G.node[pattern[2]][node_label] + G[pattern[2]][pattern[0]][edge_label]
prekey3 = G.node[pattern[3]][node_label] + G[pattern[3]][pattern[0]][edge_label]
canonkey2 = [G.node[pattern[4]][node_label], G[pattern[4]][pattern[2]][edge_label]]
canonkey3 = [G.node[pattern[5]][node_label], G[pattern[5]][pattern[3]][edge_label]]
prekey2 = [G.node[pattern[2]][node_label], G[pattern[2]][pattern[0]][edge_label]]
prekey3 = [G.node[pattern[3]][node_label], G[pattern[3]][pattern[0]][edge_label]]
if prekey2 + canonkey2 < prekey3 + canonkey3: if prekey2 + canonkey2 < prekey3 + canonkey3:
canonkey_t = G.node[pattern[1]][node_label] + G[pattern[1]][pattern[0]][edge_label] \
+ prekey2 + prekey3 + canonkey2 + canonkey3
canonkey_t = [G.node[pattern[1]][node_label]] \
+ [G[pattern[1]][pattern[0]][edge_label]] \
+ prekey2 + prekey3 + canonkey2 + canonkey3
else: else:
canonkey_t = G.node[pattern[1]][node_label] + G[pattern[1]][pattern[0]][edge_label] \
+ prekey3 + prekey2 + canonkey3 + canonkey2
treelet.append('9' + G.node[pattern[0]][node_label] + canonkey_t)
canonkey_t = [G.node[pattern[1]][node_label]] \
+ [G[pattern[1]][pattern[0]][edge_label]] \
+ prekey3 + prekey2 + canonkey3 + canonkey2
treelet.append(tuple(['9'] + [G.node[pattern[0]][node_label]] + canonkey_t))
canonkey_l.update(Counter(treelet)) canonkey_l.update(Counter(treelet))


return canonkey_l return canonkey_l
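# Why canonical keys are now tuples (point 2 of the commit message): joining
# labels into one string is ambiguous once a label may contain more than one
# character. A made-up illustration:
key_a = ''.join(['1', 'Cl'])   # node '1' followed by node 'Cl'
key_b = ''.join(['1C', 'l'])   # node '1C' followed by node 'l'
print(key_a == key_b)          # True  -> distinct patterns collide as strings
print(('1', 'Cl') == ('1C', 'l'))  # False -> tuples keep them distinct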


+ 19
- 1
pygraph/utils/graphfiles.py View File

@@ -84,7 +84,7 @@ def loadGXL(filename):
return g return g




def saveGXL(graph, filename, method='gedlib'):
def saveGXL(graph, filename, method='gedlib-letter'):
if method == 'benoit': if method == 'benoit':
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
root_node = ET.Element('gxl') root_node = ET.Element('gxl')
@@ -142,6 +142,24 @@ def saveGXL(graph, filename, method='gedlib'):
gxl_file.write("</graph>\n") gxl_file.write("</graph>\n")
gxl_file.write("</gxl>\n") gxl_file.write("</gxl>\n")
gxl_file.close() gxl_file.close()
elif method == 'gedlib-letter':
# reference: https://github.com/dbblumenthal/gedlib/blob/master/data/generate_molecules.py#L22
# and https://github.com/dbblumenthal/gedlib/blob/master/data/datasets/Letter/HIGH/AP1_0000.gxl
gxl_file = open(filename, 'w')
gxl_file.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
gxl_file.write("<!DOCTYPE gxl SYSTEM \"http://www.gupro.de/GXL/gxl-1.0.dtd\">\n")
gxl_file.write("<gxl xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n")
gxl_file.write("<graph id=\"" + str(graph.graph['name']) + "\" edgeids=\"false\" edgemode=\"undirected\">")
for v, attrs in graph.nodes(data=True):
gxl_file.write("<node id=\"_" + str(v) + "\">")
gxl_file.write("<attr name=\"x\"><float>" + str(attrs['attributes'][0]) + "</float></attr>")
gxl_file.write("<attr name=\"y\"><float>" + str(attrs['attributes'][1]) + "</float></attr>")
gxl_file.write("</node>")
for v1, v2, attrs in graph.edges(data=True):
gxl_file.write("<edge from=\"_" + str(v1) + "\" to=\"_" + str(v2) + "\"/>")
gxl_file.write("</graph>")
gxl_file.write("</gxl>")
gxl_file.close()
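# Hypothetical usage of the new 'gedlib-letter' writer (the graph below is
# made up; the method assumes 2D coordinates stored as strings under the
# 'attributes' node key, as in the Letter datasets):
import networkx as nx
from pygraph.utils.graphfiles import saveGXL
g = nx.Graph(name='AP1_0000')
g.add_node(0, attributes=['0.5', '1.5'])
g.add_node(1, attributes=['2.0', '0.0'])
g.add_edge(0, 1)
saveGXL(g, 'AP1_0000.gxl', method='gedlib-letter')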




def loadSDF(filename): def loadSDF(filename):


+ 3
- 3
pygraph/utils/model_selection_precomputed.py View File

@@ -227,9 +227,9 @@ def model_selection_for_precomputed_kernel(datafile,
str_fw += '\nall gram matrices are ignored, no results obtained.\n\n' str_fw += '\nall gram matrices are ignored, no results obtained.\n\n'
else: else:
# save gram matrices to file. # save gram matrices to file.
np.savez(results_dir + '/' + ds_name + '.gm',
gms=gram_matrices, params=param_list_pre_revised, y=y,
gmtime=gram_matrix_time)
# np.savez(results_dir + '/' + ds_name + '.gm',
# gms=gram_matrices, params=param_list_pre_revised, y=y,
# gmtime=gram_matrix_time)
if verbose: if verbose:
print( print(
'3. Fitting and predicting using nested cross validation. This could really take a while...' '3. Fitting and predicting using nested cross validation. This could really take a while...'

