@@ -49,47 +49,14 @@ def load_dataset(filename, filename_targets=None, gformat=None, **kwargs):
if extension == "ds":
data, y, label_names = load_from_ds(filename, filename_targets)
elif extension == "cxl":
import xml.etree.ElementTree as ET
dirname_dataset = dirname(filename)
tree = ET.parse(filename)
root = tree.getroot()
data = []
y = []
for graph in root.iter('graph'):
mol_filename = graph.attrib['file']
mol_class = graph.attrib['class']
data.append(load_gxl(dirname_dataset + '/' + mol_filename))
y.append(mol_class)
dir_dataset = kwargs.get('dirname_dataset', None)
data, y, label_names = load_from_xml(filename, dir_dataset)
elif extension == 'xml':
dir_dataset = kwargs.get('dirname_dataset', None)
data, y = loadFromXML(filename, dir_dataset)
elif extension == "sdf":
# import numpy as np
from tqdm import tqdm
import sys
data = loadSDF(filename)
y_raw = open(filename_targets).read().splitlines()
y_raw.pop(0)
tmp0 = []
tmp1 = []
for i in range(0, len(y_raw)):
tmp = y_raw[i].split(',')
tmp0.append(tmp[0])
tmp1.append(tmp[1].strip())
y = []
for i in tqdm(range(0, len(data)), desc='ajust data', file=sys.stdout):
try:
y.append(tmp1[tmp0.index(data[i].name)].strip())
except ValueError: # if data[i].name not in tmp0
data[i] = []
data = list(filter(lambda a: a != [], data))
data, y, label_names = load_from_xml(filename, dir_dataset)
elif extension == "mat":
order = kwargs.get('order')
data, y = loadMAT (filename, order)
data, y, label_names = load_mat(filename, order)
elif extension == 'txt':
data, y, label_names = load_tud(filename)
@@ -127,7 +94,7 @@ def save_dataset(Gn, y, gformat='gxl', group=None, filename='gfile', xparams=Non
fgroup.close()
def load_ct(filename):
def load_ct(filename): # @todo: this function is only tested on CTFile V2000; header not considered; only simple cases (atoms and bonds are considered.)
"""load data from a Chemical Table (.ct) file.
Notes
@@ -154,30 +121,65 @@ def load_ct(filename):
g = nx.Graph()
with open(filename) as f:
content = f.read().splitlines()
g = nx.Graph(
name = str(content[0]),
filename = basename(filename)) # set name of the graph
tmp = content[1].split(" ")
if tmp[0] == '':
nb_nodes = int(tmp[1]) # number of the nodes
nb_edges = int(tmp[2]) # number of the edges
else:
nb_nodes = int(tmp[0])
nb_edges = int(tmp[1])
# patch for compatibility : label will be removed later
for i in range(0, nb_nodes):
tmp = content[i + 2].split(" ")
g = nx.Graph(name=str(content[0]), filename=basename(filename)) # set name of the graph
# read the counts line.
tmp = content[1].split(' ')
tmp = [x for x in tmp if x != '']
nb_atoms = int(tmp[0].strip()) # number of atoms
nb_bonds = int(tmp[1].strip()) # number of bonds
count_line_tags = ['number_of_atoms', 'number_of_bonds', 'number_of_atom_lists', '', 'chiral_flag', 'number_of_stext_entries', '', '', '', '', 'number_of_properties', 'CT_version']
i = 0
while i < len(tmp):
if count_line_tags[i] != '': # if not obsoleted
g.graph[count_line_tags[i]] = tmp[i].strip()
i += 1
# read the atom block.
atom_tags = ['x', 'y', 'z', 'atom_symbol', 'mass_difference', 'charge', 'atom_stereo_parity', 'hydrogen_count_plus_1', 'stereo_care_box', 'valence', 'h0_designator', '', '', 'atom_atom_mapping_number', 'inversion_retention_flag', 'exact_change_flag']
for i in range(0, nb_atoms):
tmp = content[i + 2].split(' ')
tmp = [x for x in tmp if x != '']
g.add_node(i, atom=tmp[3].strip(),
label=[item.strip() for item in tmp[3:]],
attributes=[item.strip() for item in tmp[0:3]])
for i in range(0, nb_edges):
tmp = content[i + g.number_of_nodes() + 2].split(" ")
g.add_node(i)
j = 0
while j < len(tmp):
if atom_tags[j] != '':
g.nodes[i][atom_tags[j]] = tmp[j].strip()
j += 1
# read the bond block.
bond_tags = ['first_atom_number', 'second_atom_number', 'bond_type', 'bond_stereo', '', 'bond_topology', 'reacting_center_status']
for i in range(0, nb_bonds):
tmp = content[i + g.number_of_nodes() + 2].split(' ')
tmp = [x for x in tmp if x != '']
g.add_edge(int(tmp[0]) - 1, int(tmp[1]) - 1,
bond_type=tmp[2].strip(),
label=[item.strip() for item in tmp[2:]])
return g
n1, n2 = int(tmp[0].strip()) - 1, int(tmp[1].strip()) - 1
g.add_edge(n1, n2)
j = 2
while j < len(tmp):
if bond_tags[j] != '':
g.edges[(n1, n2)][bond_tags[j]] = tmp[j].strip()
j += 1
# get label names.
label_names = {'node_labels': [], 'edge_labels': [], 'node_attrs': [], 'edge_attrs': []}
atom_symbolic = [0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, None, None, 1, 1, 1]
for nd in g.nodes():
for key in g.nodes[nd]:
if atom_symbolic[atom_tags.index(key)] == 1:
label_names['node_labels'].append(key)
else:
label_names['node_attrs'].append(key)
break
bond_symbolic = [None, None, 1, 1, None, 1, 1]
for ed in g.edges():
for key in g.edges[ed]:
if bond_symbolic[bond_tags.index(key)] == 1:
label_names['edge_labels'].append(key)
else:
label_names['edge_attrs'].append(key)
break
return g, label_names
def load_gxl(filename): # @todo: directed graphs.
@@ -333,57 +335,57 @@ def saveGXL(graph, filename, method='default', node_labels=[], edge_labels=[], n
gxl_file.close()
def loadSDF(filename):
"""load data from structured data file (.sdf file).
# def loadSDF(filename):
# """load data from structured data file (.sdf file).
Notes
------
A SDF file contains a group of molecules, represented in the similar way as in MOL format.
Check `here <http://www.nonlinear.com/progenesis/sdf-studio/v0.9/faq/sdf-file-format-guidance.aspx>`__ for detailed structure.
"""
import networkx as nx
from os.path import basename
from tqdm import tqdm
import sys
data = []
with open(filename) as f:
content = f.read().splitlines()
index = 0
pbar = tqdm(total=len(content) + 1, desc='load SDF', file=sys.stdout)
while index < len(content):
index_old = index
# Notes
# ------
# A SDF file contains a group of molecules, represented in the similar way as in MOL format.
# Check `here <http://www.nonlinear.com/progenesis/sdf-studio/v0.9/faq/sdf-file-format-guidance.aspx>`__ for detailed structure.
# """
# import networkx as nx
# from os.path import basename
# from tqdm import tqdm
# import sys
# data = []
# with open(filename) as f:
# content = f.read().splitlines()
# index = 0
# pbar = tqdm(total=len(content) + 1, desc='load SDF', file=sys.stdout)
# while index < len(content):
# index_old = index
g = nx.Graph(name=content[index].strip()) # set name of the graph
# g = nx.Graph(name=content[index].strip()) # set name of the graph
tmp = content[index + 3]
nb_nodes = int(tmp[:3]) # number of the nodes
nb_edges = int(tmp[3:6]) # number of the edges
# tmp = content[index + 3]
# nb_nodes = int(tmp[:3]) # number of the nodes
# nb_edges = int(tmp[3:6]) # number of the edges
for i in range(0, nb_nodes):
tmp = content[i + index + 4]
g.add_node(i, atom=tmp[31:34].strip())
# for i in range(0, nb_nodes):
# tmp = content[i + index + 4]
# g.add_node(i, atom=tmp[31:34].strip())
for i in range(0, nb_edges):
tmp = content[i + index + g.number_of_nodes() + 4]
tmp = [tmp[i:i + 3] for i in range(0, len(tmp), 3)]
g.add_edge(
int(tmp[0]) - 1, int(tmp[1]) - 1, bond_type=tmp[2].strip())
# for i in range(0, nb_edges):
# tmp = content[i + index + g.number_of_nodes() + 4]
# tmp = [tmp[i:i + 3] for i in range(0, len(tmp), 3)]
# g.add_edge(
# int(tmp[0]) - 1, int(tmp[1]) - 1, bond_type=tmp[2].strip())
data.append(g)
# data.append(g)
index += 4 + g.number_of_nodes() + g.number_of_edges()
while content[index].strip() != '$$$$': # seperator
index += 1
index += 1
# index += 4 + g.number_of_nodes() + g.number_of_edges()
# while content[index].strip() != '$$$$': # seperator
# index += 1
# index += 1
pbar.update(index - index_old)
pbar.update(1)
pbar.close()
# pbar.update(index - index_old)
# pbar.update(1)
# pbar.close()
return data
# return data
def loadMAT(filename, order):
def load_mat(filename, order): # @todo: need to be updated (auto order) or deprecated.
"""Load graph data from a MATLAB (up to version 7.1) .mat file.
Notes
@@ -422,14 +424,13 @@ def loadMAT(filename, order):
# print(item[order[3]])
# print()
for index, label in enumerate(nl[0]):
g.add_node(index, atom =str(label))
g.add_node(index, label_1 =str(label))
el = item[order[4]][0][0][0] # edge label
for edge in el:
g.add_edge(
edge[0] - 1, edge[1] - 1, bond_type=str(edge[2]))
g.add_edge(edge[0] - 1, edge[1] - 1, label_1=str(edge[2]))
data.append(g)
else:
from scipy.sparse import csc_matrix
# from scipy.sparse import csc_matrix
for i, item in enumerate(value[0]):
# print(item)
# print('------')
@@ -438,7 +439,7 @@ def loadMAT(filename, order):
# print(nl)
# print()
for index, label in enumerate(nl[0]):
g.add_node(index, atom =str(label))
g.add_node(index, label_1 =str(label))
sam = item[order[0]] # sparse adjacency matrix
index_no0 = sam.nonzero()
for col, row in zip(index_no0[0], index_no0[1]):
@@ -447,7 +448,12 @@ def loadMAT(filename, order):
g.add_edge(col, row)
data.append(g)
# print(g.edges(data=True))
return data, y
label_names = {'node_labels': ['label_1'], 'edge_labels': [], 'node_attrs': [], 'edge_attrs': []}
if order[1] == 0:
label_names['edge_labels'].append('label_1')
return data, y, label_names
def load_tud(filename):
@@ -645,26 +651,6 @@ def load_tud(filename):
data[g].edges[n[0], n[1]][a_name] = attrs[i]
return data, targets, label_names
def loadFromXML(filename, dir_dataset=None):
    """Load a graph dataset described by an XML/CXL index file.

    The index file contains ``<graph>`` elements, each carrying a ``file``
    attribute (name of a GXL graph file) and a ``class`` attribute (target
    label of that graph).

    Parameters
    ----------
    filename : str
        Path of the XML/CXL index file.
    dir_dataset : str, optional
        Directory holding the referenced GXL files. When omitted, the
        directory of ``filename`` is used.

    Returns
    -------
    data : list
        Graphs loaded by ``load_gxl``, in index-file order.
    y : list of str
        Class attribute of each graph (kept as the raw string).
    """
    import xml.etree.ElementTree as ET

    # Default to the index file's own directory (replaces the original
    # redundant `if dir_dataset is not None: dir_dataset = dir_dataset`).
    if dir_dataset is None:
        dir_dataset = dirname(filename)
    tree = ET.parse(filename)
    root = tree.getroot()
    data = []
    y = []
    for graph in root.iter('graph'):
        mol_filename = graph.attrib['file']
        mol_class = graph.attrib['class']
        data.append(load_gxl(dir_dataset + '/' + mol_filename))
        y.append(mol_class)
    return data, y
def load_from_ds(filename, filename_targets):
@@ -678,47 +664,33 @@ def load_from_ds(filename, filename_targets):
Note these graph formats are checked automatically by the extensions of
graph files.
"""
def append_label_names(label_names, new_names):
for key, val in label_names.items():
label_names[key] += [name for name in new_names[key] if name not in val]
"""
dirname_dataset = dirname(filename)
data = []
y = []
label_names = {'node_labels': [], 'edge_labels': [], 'node_attrs': [], 'edge_attrs': []}
content = open(filename).read().splitlines()
extension = splitext(content[0].split(' ')[0])[1][1:]
if extension == 'ct':
load_file_fun = load_ct
elif extension == 'gxl' or extension == 'sdf': # @todo: .sdf not tested yet.
load_file_fun = load_gxl
if filename_targets is None or filename_targets == '':
if extension == 'ct':
for i in range(0, len(content)):
tmp = content[i].split(' ')
# remove the '#'s in file names
data.append(
load_ct(dirname_dataset + '/' + tmp[0].replace('#', '', 1)))
y.append(float(tmp[1]))
elif extension == 'gxl':
for i in range(0, len(content)):
tmp = content[i].split(' ')
# remove the '#'s in file names
g, l_names = load_gxl(dirname_dataset + '/' + tmp[0].replace('#', '', 1))
data.append(g)
append_label_names(label_names, l_names)
y.append(float(tmp[1]))
else: # y in a seperate file
if extension == 'ct':
for i in range(0, len(content)):
tmp = content[i]
# remove the '#'s in file names
data.append(
load_ct(dirname_dataset + '/' + tmp.replace('#', '', 1)))
elif extension == 'gxl':
for i in range(0, len(content)):
tmp = content[i]
# remove the '#'s in file names
g, l_names = load_gxl(dirname_dataset + '/' + tmp[0].replace('#', '', 1))
data.append(g)
append_label_names(label_names, l_names)
for i in range(0, len(content)):
tmp = content[i].split(' ')
# remove the '#'s in file names
g, l_names = load_file_fun(dirname_dataset + '/' + tmp[0].replace('#', '', 1))
data.append(g)
__append_label_names(label_names, l_names)
y.append(float(tmp[1]))
else: # targets in a seperate file
for i in range(0, len(content)):
tmp = content[i]
# remove the '#'s in file names
g, l_names = load_file_fun(dirname_dataset + '/' + tmp.replace('#', '', 1))
data.append(g)
__append_label_names(label_names, l_names)
content_y = open(filename_targets).read().splitlines()
# assume entries in filename and filename_targets have the same order.
@@ -728,7 +700,50 @@ def load_from_ds(filename, filename_targets):
y.append(float(tmp[2]))
return data, y, label_names
# def load_from_cxl(filename):
# import xml.etree.ElementTree as ET
#
# dirname_dataset = dirname(filename)
# tree = ET.parse(filename)
# root = tree.getroot()
# data = []
# y = []
# for graph in root.iter('graph'):
# mol_filename = graph.attrib['file']
# mol_class = graph.attrib['class']
# data.append(load_gxl(dirname_dataset + '/' + mol_filename))
# y.append(mol_class)
def load_from_xml(filename, dir_dataset=None):
    """Load a graph dataset (graphs, targets, label names) from an XML index.

    Like ``loadFromXML`` but additionally accumulates, across all graphs,
    the node/edge label and attribute names reported by ``load_gxl``.

    Parameters
    ----------
    filename : str
        Path of the XML/CXL index file; each ``<graph>`` element has a
        ``file`` attribute (GXL file name) and a ``class`` attribute.
    dir_dataset : str, optional
        Directory holding the referenced GXL files. When omitted, the
        directory of ``filename`` is used.

    Returns
    -------
    data : list
        Graphs loaded by ``load_gxl``, in index-file order.
    y : list of str
        Class attribute of each graph (kept as the raw string).
    label_names : dict
        Keys ``'node_labels'``, ``'edge_labels'``, ``'node_attrs'``,
        ``'edge_attrs'``; each maps to the de-duplicated list of names seen
        over all loaded graphs.
    """
    import xml.etree.ElementTree as ET

    # Default to the index file's own directory (replaces the original
    # redundant `if dir_dataset is not None: dir_dataset = dir_dataset`).
    if dir_dataset is None:
        dir_dataset = dirname(filename)
    tree = ET.parse(filename)
    root = tree.getroot()
    data = []
    y = []
    label_names = {'node_labels': [], 'edge_labels': [], 'node_attrs': [], 'edge_attrs': []}
    for graph in root.iter('graph'):
        mol_filename = graph.attrib['file']
        mol_class = graph.attrib['class']
        g, l_names = load_gxl(dir_dataset + '/' + mol_filename)
        data.append(g)
        __append_label_names(label_names, l_names)
        y.append(mol_class)
    return data, y, label_names
def __append_label_names(label_names, new_names):
    """Merge *new_names* into *label_names* in place.

    For each category key already present in ``label_names`` (e.g.
    ``'node_labels'``), the names from ``new_names`` under the same key are
    appended in order, skipping any name the accumulated list already holds.
    The additions are computed before extending, so repeated names inside a
    single ``new_names`` list are all appended (matching ``+=`` with a
    comprehension filtered against the pre-merge list).
    """
    for category, existing in label_names.items():
        additions = [name for name in new_names[category] if name not in existing]
        existing.extend(additions)
if __name__ == '__main__':
# ### Load dataset from .ds file.
@@ -736,25 +751,34 @@ if __name__ == '__main__':
# ds = {'name': 'Alkane', 'dataset': '../../datasets/Alkane/dataset.ds',
# 'dataset_y': '../../datasets/Alkane/dataset_boiling_point_names.txt'}
# Gn, y = loadDataset(ds['dataset'], filename_y=ds['dataset_y'])
## ds = {'name': 'Acyclic', 'dataset': '../../datasets/acyclic/dataset_bps.ds'} # node symb
## Gn, y = loadDataset(ds['dataset'] )
## ds = {'name': 'MAO', 'dataset': '../../datasets/MAO/dataset.ds'} # node/edge symb
## Gn, y = loadDataset(ds['dataset'] )
# ds_file = '../../datasets/Acyclic/dataset_bps.ds' # node symb
# Gn, targets, label_names = load_dataset(ds_file )
# ds_file = '../../datasets/MAO/dataset.ds' # node/edge symb
# Gn, targets, label_names = load_dataset(ds_file )
## ds = {'name': 'PAH', 'dataset': '../../datasets/PAH/dataset.ds'} # unlabeled
## Gn, y = loadDataset(ds['dataset'])
# print(Gn[1].nodes(data=True))
# print(Gn[1].edges(data=True))
# print(y[1])
# print(Gn[1].graph)
# print(Gn[1].nodes(data=True))
# print(Gn[1].edges(data=True))
# print(targets[1])
# .gxl file.
ds = {'name': 'monoterpenoides',
'dataset': '../../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
Gn, y, label_names = load_dataset(ds['dataset'])
# # .gxl file.
# ds_file = '../../datasets/monoterpenoides/dataset_10+.ds' # node/edge symb
# Gn, y, label_names = load_dataset(ds_file)
# print(Gn[1].graph)
# print(Gn[1].nodes(data=True))
# print(Gn[1].edges(data=True))
# print(y[1])
# .mat file.
ds_file = '../../datasets/MUTAG_mat/MUTAG.mat'
order = [0, 0, 3, 1, 2]
Gn, targets, label_names = load_dataset(ds_file, order=order)
print(Gn[1].graph)
print(Gn[1].nodes(data=True))
print(Gn[1].edges(data=True))
print(y[1])
print(targets [1])
# ### Convert graph from one format to another.
# # .gxl file.
# import networkx as nx