@@ -127,7 +127,7 @@ def save_dataset(Gn, y, gformat='gxl', group=None, filename='gfile', xparams=Non
fgroup.close()
def load_ct(filename):
def load_ct(filename): # @todo: this function is only tested on CTFile V2000; header not considered; only simple cases (atoms and bonds) are considered.
"""load data from a Chemical Table (.ct) file.
"""load data from a Chemical Table (.ct) file.
Notes
Notes
@@ -154,30 +154,65 @@ def load_ct(filename):
g = nx.Graph()
with open(filename) as f:
content = f.read().splitlines()
g = nx.Graph(
name = str(content[0]),
filename = basename(filename)) # set name of the graph
tmp = content[1].split(" ")
if tmp[0] == '':
nb_nodes = int(tmp[1]) # number of the nodes
nb_edges = int(tmp[2]) # number of the edges
else:
nb_nodes = int(tmp[0])
nb_edges = int(tmp[1])
# patch for compatibility : label will be removed later
for i in range(0, nb_nodes):
tmp = content[i + 2].split(" ")
g = nx.Graph(name=str(content[0]), filename=basename(filename)) # set name of the graph
# read the counts line.
tmp = content[1].split(' ')
tmp = [x for x in tmp if x != '']
nb_atoms = int(tmp[0].strip()) # number of atoms
nb_bonds = int(tmp[1].strip()) # number of bonds
count_line_tags = ['number_of_atoms', 'number_of_bonds', 'number_of_atom_lists', '', 'chiral_flag', 'number_of_stext_entries', '', '', '', '', 'number_of_properties', 'CT_version']
i = 0
while i < len(tmp):
if count_line_tags[i] != '': # if the field is not obsolete
g.graph[count_line_tags[i]] = tmp[i].strip()
i += 1
# read the atom block.
atom_tags = ['x', 'y', 'z', 'atom_symbol', 'mass_difference', 'charge', 'atom_stereo_parity', 'hydrogen_count_plus_1', 'stereo_care_box', 'valence', 'h0_designator', '', '', 'atom_atom_mapping_number', 'inversion_retention_flag', 'exact_change_flag']
for i in range(0, nb_atoms):
tmp = content[i + 2].split(' ')
tmp = [x for x in tmp if x != '']
g.add_node(i, atom=tmp[3].strip(),
label=[item.strip() for item in tmp[3:]],
attributes=[item.strip() for item in tmp[0:3]])
for i in range(0, nb_edges):
tmp = content[i + g.number_of_nodes() + 2].split(" ")
g.add_node(i)
j = 0
while j < len(tmp):
if atom_tags[j] != '':
g.nodes[i][atom_tags[j]] = tmp[j].strip()
j += 1
# read the bond block.
bond_tags = ['first_atom_number', 'second_atom_number', 'bond_type', 'bond_stereo', '', 'bond_topology', 'reacting_center_status']
for i in range(0, nb_bonds):
tmp = content[i + g.number_of_nodes() + 2].split(' ')
tmp = [x for x in tmp if x != '']
g.add_edge(int(tmp[0]) - 1, int(tmp[1]) - 1,
bond_type=tmp[2].strip(),
label=[item.strip() for item in tmp[2:]])
return g
n1, n2 = int(tmp[0].strip()) - 1, int(tmp[1].strip()) - 1
g.add_edge(n1, n2)
j = 2
while j < len(tmp):
if bond_tags[j] != '':
g.edges[(n1, n2)][bond_tags[j]] = tmp[j].strip()
j += 1
# get label names.
label_names = {'node_labels': [], 'edge_labels': [], 'node_attrs': [], 'edge_attrs': []}
atom_symbolic = [0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, None, None, 1, 1, 1]
for nd in g.nodes():
for key in g.nodes[nd]:
if atom_symbolic[atom_tags.index(key)] == 1:
label_names['node_labels'].append(key)
else:
label_names['node_attrs'].append(key)
break
bond_symbolic = [None, None, 1, 1, None, 1, 1]
for ed in g.edges():
for key in g.edges[ed]:
if bond_symbolic[bond_tags.index(key)] == 1:
label_names['edge_labels'].append(key)
else:
label_names['edge_attrs'].append(key)
break
return g, label_names
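# For orientation, a minimal sketch of the .ct layout the parser above expects
# (the file name and values are illustrative, not taken from the repository):
#
#   example molecule      <- line 0: name, stored via nx.Graph(name=...)
#   2 1                   <- line 1: counts line (nb_atoms, nb_bonds, ...)
#   0.0 0.0 0.0 C         <- atom block: x, y, z, atom_symbol, ...
#   0.0 0.0 0.0 O
#   1 2 1                 <- bond block: first_atom_number, second_atom_number, bond_type, ...
#
# Loading such a file would then give, roughly:
#   g, l_names = load_ct('mol.ct')   # hypothetical path
#   g.nodes[0]['atom_symbol']        # -> 'C'
#   g.edges[(0, 1)]['bond_type']     # -> '1'
#   l_names['node_labels']           # includes 'atom_symbol'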
def load_gxl(filename): # @todo: directed graphs.
@@ -678,11 +713,7 @@ def load_from_ds(filename, filename_targets):
Note these graph formats are checked automatically by the extensions of
graph files.
"""
def append_label_names(label_names, new_names):
for key, val in label_names.items():
label_names[key] += [name for name in new_names[key] if name not in val]
"""
dirname_dataset = dirname(filename)
data = []
y = []
@@ -694,8 +725,9 @@ def load_from_ds(filename, filename_targets):
for i in range(0, len(content)):
tmp = content[i].split(' ')
# remove the '#'s in file names
data.append(
load_ct(dirname_dataset + '/' + tmp[0].replace('#', '', 1)))
g, l_names = load_ct(dirname_dataset + '/' + tmp[0].replace('#', '', 1))
data.append(g)
__append_label_names(label_names, l_names)
y.append(float(tmp[1]))
elif extension == 'gxl':
for i in range(0, len(content)):
@@ -703,22 +735,23 @@ def load_from_ds(filename, filename_targets):
# remove the '#'s in file names
g, l_names = load_gxl(dirname_dataset + '/' + tmp[0].replace('#', '', 1))
data.append(g)
append_label_names(label_names, l_names)
__append_label_names(label_names, l_names)
y.append(float(tmp[1]))
else: # y in a separate file
if extension == 'ct':
for i in range(0, len(content)):
tmp = content[i]
# remove the '#'s in file names
data.append(
load_ct(dirname_dataset + '/' + tmp.replace('#', '', 1)))
g, l_names = load_ct(dirname_dataset + '/' + tmp.replace('#', '', 1))
data.append(g)
__append_label_names(label_names, l_names)
elif extension == 'gxl':
for i in range(0, len(content)):
tmp = content[i]
# remove the '#'s in file names
g, l_names = load_gxl(dirname_dataset + '/' + tmp[0] .replace('#', '', 1))
g, l_names = load_gxl(dirname_dataset + '/' + tmp.replace('#', '', 1))
data.append(g)
append_label_names(label_names, l_names)
__append_label_names(label_names, l_names)
content_y = open(filename_targets).read().splitlines()
# assume entries in filename and filename_targets have the same order.
@@ -728,7 +761,12 @@ def load_from_ds(filename, filename_targets):
y.append(float(tmp[2]))
return data, y, label_names
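# A rough sketch of the '.ds' index files read above (file names are illustrative):
# with targets embedded, each line pairs a graph file with a float target,
#
#   molecule001.ct 173.5
#   molecule002.ct 159.0
#
# and when filename_targets is given, the .ds file lists only the graph files,
# while each line of the targets file carries the value as its third
# space-separated field (tmp[2] above).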
def __append_label_names(label_names, new_names):
for key, val in label_names.items():
label_names[key] += [name for name in new_names[key] if name not in val]
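# Minimal usage sketch (values illustrative): starting from
#   names = {'node_labels': [], 'edge_labels': [], 'node_attrs': [], 'edge_attrs': []}
# a call such as
#   __append_label_names(names, {'node_labels': ['atom_symbol'], 'edge_labels': ['bond_type'],
#                                'node_attrs': ['x', 'y', 'z'], 'edge_attrs': []})
# extends each list in place, and repeating the same call adds nothing, so label
# names collected over many graphs stay free of duplicates.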
if __name__ == '__main__':
# ### Load dataset from .ds file.
@@ -736,24 +774,24 @@ if __name__ == '__main__':
# ds = {'name': 'Alkane', 'dataset': '../../datasets/Alkane/dataset.ds',
# 'dataset_y': '../../datasets/Alkane/dataset_boiling_point_names.txt'}
# Gn, y = loadDataset(ds['dataset'], filename_y=ds['dataset_y'])
## ds = {'name': 'Acyclic', 'dataset': '../../datasets/acyclic/dataset_bps.ds'} # node symb
## Gn, y = loadDataset(ds['dataset'] )
ds_file = '../../datasets/acyclic/dataset_bps.ds' # node symb
Gn, targets, label_names = load_dataset(ds_file)
## ds = {'name': 'MAO', 'dataset': '../../datasets/MAO/dataset.ds'} # node/edge symb
## Gn, y = loadDataset(ds['dataset'])
## ds = {'name': 'PAH', 'dataset': '../../datasets/PAH/dataset.ds'} # unlabeled
## Gn, y = loadDataset(ds['dataset'])
# print(Gn[1].nodes(data=True))
# print(Gn[1].edges(data=True))
# print(y[1])
# .gxl file.
ds = {'name': 'monoterpenoides',
'dataset': '../../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
Gn, y, label_names = load_dataset(ds['dataset'])
print(Gn[1].graph)
print(Gn[1].nodes(data=True))
print(Gn[1].edges(data=True))
print(y[1])
print(targets[1])
# # .gxl file.
# ds_file = '../../datasets/monoterpenoides/dataset_10+.ds' # node/edge symb
# Gn, y, label_names = load_dataset(ds_file)
# print(Gn[1].graph)
# print(Gn[1].nodes(data=True))
# print(Gn[1].edges(data=True))
# print(y[1])
# ### Convert graph from one format to another.
# # .gxl file.