diff --git a/gklearn/dataset/file_managers.py b/gklearn/dataset/file_managers.py index 2fa06f9..3a6e831 100644 --- a/gklearn/dataset/file_managers.py +++ b/gklearn/dataset/file_managers.py @@ -537,16 +537,18 @@ class DataLoader(): def load_cml(self, filename): # @todo: directed graphs. + # @todo: what is "atomParity" and "bondStereo" in the data file? from os.path import basename import networkx as nx import xml.etree.ElementTree as ET + xmlns = '{http://www.xml-cml.org/schema}' # @todo: why this has to be added? tree = ET.parse(filename) root = tree.getroot() index = 0 - g = nx.Graph(filename=basename(filename), name=root.attrib['id']) + g_id = root.find(xmlns + 'molecule').attrib['id'] + g = nx.Graph(filename=basename(filename), name=g_id) dic = {} # used to retrieve incident nodes of edges - xmlns = '{http://www.xml-cml.org/schema}' # @todo: why this has to be added? for atom in root.iter(xmlns + 'atom'): dic[atom.attrib['id']] = index labels = {} diff --git a/gklearn/dataset/metadata.py b/gklearn/dataset/metadata.py index d2a11da..ed3af41 100644 --- a/gklearn/dataset/metadata.py +++ b/gklearn/dataset/metadata.py @@ -33,6 +33,7 @@ GREYC_META = { 'train_valid_test': [], 'stereoisomerism': True, 'load_files': ['data.ds'], + 'extra_params': {'gformat': 'cml'} }, 'Acyclic': { 'database': 'greyc', @@ -224,7 +225,8 @@ GREYC_META = { 'domain': 'small molecules', 'train_valid_test': [], 'stereoisomerism': True, - 'load_files': [], + 'load_files': ['data.txt'], + 'extra_params': {'gformat': 'cml'} }, }