""" Utility functions to manage graph files
"""
-
def loadCT(filename):
    """Load a single graph from a .ct file.

    Parameters
    ----------
    filename : str
        Path of the .ct file to read.

    Returns
    -------
    networkx.Graph
        Graph whose nodes are numbered from 0 in file order. The graph
        attributes ``name`` (first line of the file) and ``filename``
        (base name of the path) are set. Each node carries ``atom`` and
        ``label``; each edge carries ``bond_type`` and ``label``.

    Notes
    ------
    a typical example of data in .ct is like this:

    name of the graph                   <- first line, used as graph name
     3 2                                <- number of nodes and edges
     0.0000 0.0000 0.0000 C             <- each line describes a node (x,y,z + label)
     0.0000 0.0000 0.0000 C
     0.0000 0.0000 0.0000 O
     1 3 1 1                            <- each line describes an edge : to, from,?, label
     2 3 1 1

    Fields may be separated (and preceded) by any amount of whitespace.
    Node indices in edge lines are 1-based in the file and are shifted to
    0-based in the returned graph.
    """
    import networkx as nx
    from os.path import basename

    with open(filename) as f:
        content = f.read().splitlines()

    # set name of the graph
    g = nx.Graph(name=str(content[0]), filename=basename(filename))

    # split() without a separator drops leading/repeated whitespace, so
    # fixed-width headers such as "  3  2" parse as well as "3 2".
    counts = content[1].split()
    nb_nodes = int(counts[0])  # number of the nodes
    nb_edges = int(counts[1])  # number of the edges

    # patch for compatibility : label will be removed later
    for i in range(nb_nodes):
        fields = content[i + 2].split()
        g.add_node(i, atom=fields[3], label=fields[3])

    first_edge_line = nb_nodes + 2
    for i in range(nb_edges):
        fields = content[first_edge_line + i].split()
        g.add_edge(int(fields[0]) - 1, int(fields[1]) - 1,
                   bond_type=fields[3].strip(), label=fields[3].strip())
    return g
-
-
def loadGXL(filename):
    """Build a graph from a GXL file.

    Parameters
    ----------
    filename : str
        Path of the GXL file to parse.

    Returns
    -------
    networkx.Graph
        Graph whose nodes are numbered from 0 in document order. The graph
        attributes ``filename`` (base name of the path) and ``name`` (the
        ``id`` of the first child of the XML root) are set. Every ``attr``
        element found under a node/edge becomes an attribute whose value is
        the text of that element's first child. A ``chem`` node attribute
        is duplicated as ``label``; a ``valence`` edge attribute is
        duplicated as ``label``.
    """
    from os.path import basename
    import networkx as nx
    import xml.etree.ElementTree as ET

    def collect_attributes(element, alias_key):
        # Map each <attr> name to the text of its first child element.
        found = {}
        for attr_elem in element.iter('attr'):
            found[attr_elem.attrib['name']] = attr_elem[0].text
        if alias_key in found:
            found['label'] = found[alias_key]
        return found

    gxl_root = ET.parse(filename).getroot()
    graph = nx.Graph(filename=basename(filename),
                     name=gxl_root[0].attrib['id'])

    # GXL node id -> integer node index, used to resolve edge endpoints
    index_of = {}
    for position, node_elem in enumerate(gxl_root.iter('node')):
        index_of[node_elem.attrib['id']] = position
        graph.add_node(position, **collect_attributes(node_elem, 'chem'))

    for edge_elem in gxl_root.iter('edge'):
        edge_attrs = collect_attributes(edge_elem, 'valence')
        graph.add_edge(index_of[edge_elem.attrib['from']],
                       index_of[edge_elem.attrib['to']],
                       **edge_attrs)
    return graph
-
def saveGXL(graph, filename):
    """Write a graph to a file in GXL format.

    Parameters
    ----------
    graph : networkx.Graph
        Graph to save. ``graph.graph['name']`` is written as the ``id`` of
        the GXL ``graph`` element.
    filename : str
        Path of the file to write.

    Raises
    ------
    KeyError
        If the graph has no ``name`` entry in ``graph.graph``.

    Notes
    ------
    Each node/edge attribute is written as
    ``<attr name="KEY"><TYPE>VALUE</TYPE></attr>`` where ``TYPE`` is the
    Python class name of the value (e.g. ``str``, ``int``) and ``VALUE`` is
    ``str(value)``.

    An edge is written once, for the endpoint order with ``v1 < v2``;
    self-loops are therefore not written.
    """
    import xml.etree.ElementTree as ET

    def _write_attrs(parent, attributes):
        # One <attr> child per attribute, holding a single typed value element.
        for key, value in attributes.items():
            attr_node = ET.SubElement(parent, 'attr', attrib={'name': str(key)})
            value_node = ET.SubElement(attr_node, value.__class__.__name__)
            # ElementTree can only serialise str text; convert so that
            # numeric (or other non-string) values do not break write().
            value_node.text = str(value)

    root_node = ET.Element('gxl')
    graph_attrib = {
        'id': str(graph.graph['name']),
        'edgeids': 'true',
        'edgemode': 'undirected',
    }
    graph_node = ET.SubElement(root_node, 'graph', attrib=graph_attrib)

    for v in graph:
        current_node = ET.SubElement(graph_node, 'node', attrib={'id': str(v)})
        _write_attrs(current_node, graph.nodes[v])

    for v1 in graph:
        for v2 in graph[v1]:
            if v1 < v2:  # Non oriented graphs: emit each edge only once
                cur_edge = ET.SubElement(graph_node, 'edge',
                                         attrib={'from': str(v1),
                                                 'to': str(v2)})
                _write_attrs(cur_edge, graph[v1][v2])

    ET.ElementTree(root_node).write(filename)
-
-
def loadSDF(filename):
    """Load data from structured data file (.sdf file).

    Parameters
    ----------
    filename : str
        Path of the .sdf file to read.

    Returns
    -------
    list of networkx.Graph
        One graph per record, in file order. The graph ``name`` is the
        stripped first line of the record, each node carries ``atom`` and
        each edge carries ``bond_type``.

    Notes
    ------
    A SDF file contains a group of molecules, represented in the similar way as in MOL format.
    see http://www.nonlinear.com/progenesis/sdf-studio/v0.9/faq/sdf-file-format-guidance.aspx, 2018 for detailed structure.

    Within a record, the 4th line holds the node and edge counts in its
    first two 3-character columns; atom lines follow, then bond lines made
    of 3-character columns. Records are separated by a ``$$$$`` line. Blank
    lines after the last record and a missing final ``$$$$`` are tolerated.
    """
    import networkx as nx
    from tqdm import tqdm
    import sys

    with open(filename) as f:
        content = f.read().splitlines()

    # Blank lines after the last record must not be parsed as a new record.
    end = len(content)
    while end > 0 and not content[end - 1].strip():
        end -= 1

    data = []
    index = 0
    pbar = tqdm(total=len(content) + 1, desc='load SDF', file=sys.stdout)
    while index < end:
        index_old = index

        g = nx.Graph(name=content[index].strip())  # set name of the graph

        counts = content[index + 3]
        nb_nodes = int(counts[:3])  # number of the nodes
        nb_edges = int(counts[3:6])  # number of the edges

        first_atom = index + 4
        for i in range(nb_nodes):
            g.add_node(i, atom=content[first_atom + i][31:34].strip())

        first_bond = first_atom + nb_nodes
        for i in range(nb_edges):
            line = content[first_bond + i]
            # bond lines are made of fixed-width 3-character columns
            fields = [line[k:k + 3] for k in range(0, len(line), 3)]
            g.add_edge(int(fields[0]) - 1, int(fields[1]) - 1,
                       bond_type=fields[2].strip())

        data.append(g)

        # Skip the remaining lines of the record up to the separator; the
        # bounds check copes with a last record that lacks '$$$$'.
        index = first_bond + nb_edges
        while index < end and content[index].strip() != '$$$$':
            index += 1
        if index < end:
            index += 1  # step past the separator

        pbar.update(index - index_old)
    # account for any ignored trailing lines plus the final tick
    pbar.update(len(content) - index + 1)
    pbar.close()

    return data
-
-
-
def loadDataset(filename, filename_y=''):
    """Load the graphs and targets of a dataset.

    The loader is chosen from the extension of ``filename``:

    * ``.ds``  : each non-blank line is ``<ct file> <target>``; graphs are
      read with ``loadCT`` and targets converted to ``float``. The first
      ``#`` of the file name is removed.
    * ``.cxl`` : each ``print`` element gives a GXL file (``file``
      attribute, read with ``loadGXL``) and its target (``class``
      attribute, kept as a string).
    * ``.sdf`` : graphs are read with ``loadSDF``; targets come from
      ``filename_y``. Graphs whose name has no target are dropped.

    Parameters
    ----------
    filename : str
        Path of the dataset file. Graph files referenced by a ``.ds`` or
        ``.cxl`` file are resolved relative to its directory.
    filename_y : str, optional
        Only used for ``.sdf``: comma-separated file whose first line is
        skipped and whose rows start with ``<graph name>,<target>``.

    Returns
    -------
    data : list
        The loaded graphs.
    y : list
        The targets, aligned with ``data``. Both lists are empty when the
        extension is not one of the above.
    """
    from os.path import dirname, join, splitext

    # join() (rather than dirname + '/') keeps a bare relative dataset
    # name from resolving its graph files against the filesystem root.
    dirname_dataset = dirname(filename)
    extension = splitext(filename)[1][1:]
    data = []
    y = []
    if extension == "ds":
        with open(filename) as f:
            content = f.read().splitlines()
        for line in content:
            if not line.strip():
                continue  # tolerate blank lines in the list
            tmp = line.split()
            # remove the '#'s in file names
            data.append(loadCT(join(dirname_dataset,
                                    tmp[0].replace('#', '', 1))))
            y.append(float(tmp[1]))
    elif extension == "cxl":
        import xml.etree.ElementTree as ET

        root = ET.parse(filename).getroot()
        for graph in root.iter('print'):
            mol_filename = graph.attrib['file']
            mol_class = graph.attrib['class']
            data.append(loadGXL(join(dirname_dataset, mol_filename)))
            y.append(mol_class)
    elif extension == "sdf":
        from tqdm import tqdm
        import sys

        graphs = loadSDF(filename)

        with open(filename_y) as f:
            rows = f.read().splitlines()[1:]  # skip the header line

        # name -> target; setdefault keeps the first row for a repeated name
        targets = {}
        for row in rows:
            if not row.strip():
                continue
            fields = row.split(',')
            targets.setdefault(fields[0], fields[1].strip())

        # keep only the graphs that have a target
        for g in tqdm(graphs, desc='ajust data', file=sys.stdout):
            if g.name in targets:
                data.append(g)
                y.append(targets[g.name])

    return data, y
|