From c74c87e0cb46d3f7927d2493fe93f1ae25e495a1 Mon Sep 17 00:00:00 2001
From: linlin
Date: Sun, 4 Oct 2020 19:17:07 +0200
Subject: [PATCH] New translations utils.py (French)

---
 lang/fr/gklearn/utils/utils.py | 605 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 605 insertions(+)
 create mode 100644 lang/fr/gklearn/utils/utils.py

diff --git a/lang/fr/gklearn/utils/utils.py b/lang/fr/gklearn/utils/utils.py
new file mode 100644
index 0000000..c32169d
--- /dev/null
+++ b/lang/fr/gklearn/utils/utils.py
@@ -0,0 +1,605 @@
+import networkx as nx
+import numpy as np
+from copy import deepcopy
+from enum import Enum, unique
+# from itertools import product
+
+# from tqdm import tqdm
+
+
+def getSPLengths(G1):
+    sp = nx.shortest_path(G1)
+    distances = np.zeros((G1.number_of_nodes(), G1.number_of_nodes()))
+    for i in sp.keys():
+        for j in sp[i].keys():
+            distances[i, j] = len(sp[i][j]) - 1
+    return distances
+
+
+def getSPGraph(G, edge_weight=None):
+    """Transform graph G to its corresponding shortest-paths graph.
+
+    Parameters
+    ----------
+    G : NetworkX graph
+        The graph to be transformed.
+    edge_weight : string
+        Edge attribute corresponding to the edge weight.
+
+    Return
+    ------
+    S : NetworkX graph
+        The shortest-paths graph corresponding to G.
+
+    Notes
+    -----
+    For an input graph G, its corresponding shortest-paths graph S contains the same set of nodes as G, and there is an edge in S between every pair of nodes that is connected by a walk in G. Every edge in S is labeled by the shortest distance between its two end nodes.
+
+    References
+    ----------
+    .. [1] Borgwardt KM, Kriegel HP. Shortest-path kernels on graphs. In Data Mining, Fifth IEEE International Conference on 2005 Nov 27 (pp. 8-pp). IEEE.
+    """
+    return floydTransformation(G, edge_weight=edge_weight)
+
+
+def floydTransformation(G, edge_weight=None):
+    """Transform graph G to its corresponding shortest-paths graph using the Floyd transformation.
+
+    Parameters
+    ----------
+    G : NetworkX graph
+        The graph to be transformed.
+    edge_weight : string
+        Edge attribute corresponding to the edge weight. The default edge weight is bond_type.
+
+    Return
+    ------
+    S : NetworkX graph
+        The shortest-paths graph corresponding to G.
+
+    References
+    ----------
+    .. [1] Borgwardt KM, Kriegel HP. Shortest-path kernels on graphs. In Data Mining, Fifth IEEE International Conference on 2005 Nov 27 (pp. 8-pp). IEEE.
+    """
+    spMatrix = nx.floyd_warshall_numpy(G, weight=edge_weight)
+    S = nx.Graph()
+    S.add_nodes_from(G.nodes(data=True))
+    ns = list(G.nodes())
+    for i in range(0, G.number_of_nodes()):
+        for j in range(i + 1, G.number_of_nodes()):
+            if spMatrix[i, j] != np.inf:
+                S.add_edge(ns[i], ns[j], cost=spMatrix[i, j])
+    return S
+
+
+def get_shortest_paths(G, weight, directed):
+    """Get all shortest paths of a graph.
+
+    Parameters
+    ----------
+    G : NetworkX graph
+        The graph whose paths are calculated.
+    weight : string/None
+        Edge attribute used as weight to calculate the shortest paths.
+    directed : boolean
+        Whether the graph is directed.
+
+    Return
+    ------
+    sp : list of list
+        List of shortest paths of the graph, where each path is represented by a list of nodes.
+    """
+    from itertools import combinations
+    sp = []
+    for n1, n2 in combinations(G.nodes(), 2):
+        try:
+            spltemp = list(nx.all_shortest_paths(G, n1, n2, weight=weight))
+        except nx.NetworkXNoPath:  # nodes not connected
+            pass
+        else:
+            sp += spltemp
+            # each path is counted twice, once from each of its two end nodes;
+            # for undirected graphs the reversed copies are added as well.
+            if not directed:
+                sp += [sptemp[::-1] for sptemp in spltemp]
+
+    # add single nodes as length 0 paths.
+    sp += [[n] for n in G.nodes()]
+    return sp
+
+
+def untotterTransformation(G, node_label, edge_label):
+    """Transform graph G according to Mahé et al.'s work to filter out tottering patterns of the marginalized kernel and the tree pattern kernel.
+
+    Parameters
+    ----------
+    G : NetworkX graph
+        The graph to be transformed.
+    node_label : string
+        Node attribute used as label. The default node label is 'atom'.
+    edge_label : string
+        Edge attribute used as label. The default edge label is 'bond_type'.
+
+    Return
+    ------
+    gt : NetworkX graph
+        The transformed graph corresponding to G.
+
+    References
+    ----------
+    .. [1] Pierre Mahé, Nobuhisa Ueda, Tatsuya Akutsu, Jean-Luc Perret, and Jean-Philippe Vert. Extensions of marginalized graph kernels. In Proceedings of the twenty-first international conference on Machine learning, page 70. ACM, 2004.
+    """
+    # make the graph directed so that both orientations of each edge are handled.
+    G = G.to_directed()
+    gt = nx.Graph()
+    gt.graph = G.graph
+    gt.add_nodes_from(G.nodes(data=True))
+    for edge in G.edges():
+        gt.add_node(edge)
+        gt.nodes[edge].update({node_label: G.nodes[edge[1]][node_label]})
+        gt.add_edge(edge[0], edge)
+        gt.edges[edge[0], edge].update(
+            {edge_label: G[edge[0]][edge[1]][edge_label]})
+        for neighbor in G[edge[1]]:
+            if neighbor != edge[0]:
+                gt.add_edge(edge, (edge[1], neighbor))
+                gt.edges[edge, (edge[1], neighbor)].update(
+                    {edge_label: G[edge[1]][neighbor][edge_label]})
+    # nx.draw_networkx(gt)
+    # plt.show()
+
+    # relabel nodes using consecutive integers for convenience of kernel calculation.
+    gt = nx.convert_node_labels_to_integers(
+        gt, first_label=0, label_attribute='label_orignal')
+    return gt
+
+
+def direct_product(G1, G2, node_label, edge_label):
+    """Return the direct/tensor product of directed graphs G1 and G2.
+
+    Parameters
+    ----------
+    G1, G2 : NetworkX graph
+        The original graphs.
+    node_label : string
+        Node attribute used as label. The default node label is 'atom'.
+    edge_label : string
+        Edge attribute used as label. The default edge label is 'bond_type'.
+
+    Return
+    ------
+    gt : NetworkX graph
+        The direct product graph of G1 and G2.
+
+    Notes
+    -----
+    This method differs from networkx.tensor_product in that it only adds nodes and edges of G1 and G2 that have the same labels to the direct product graph.
+
+    References
+    ----------
+    .. [1] Thomas Gärtner, Peter Flach, and Stefan Wrobel. On graph kernels: Hardness results and efficient alternatives. Learning Theory and Kernel Machines, pages 129–143, 2003.
+    """
+    from itertools import product
+    # G = G.to_directed()
+    gt = nx.DiGraph()
+    # add nodes: only pairs of nodes carrying the same label.
+    for u, v in product(G1, G2):
+        if G1.nodes[u][node_label] == G2.nodes[v][node_label]:
+            gt.add_node((u, v))
+            gt.nodes[(u, v)].update({node_label: G1.nodes[u][node_label]})
+    # add edges; faster for sparse graphs (not too many edges), which is the most common case for now.
+    for (u1, v1), (u2, v2) in product(G1.edges, G2.edges):
+        if (u1, u2) in gt and (v1, v2) in gt \
+                and G1.edges[u1, v1][edge_label] == G2.edges[u2, v2][edge_label]:
+            gt.add_edge((u1, u2), (v1, v2))
+            gt.edges[(u1, u2), (v1, v2)].update(
+                {edge_label: G1.edges[u1, v1][edge_label]})
+
+    # # add edges; faster for dense graphs (many edges, complete graphs benefit the most).
+    # for u, v in product(gt, gt):
+    #     if (u[0], v[0]) in G1.edges and (u[1], v[1]) in G2.edges \
+    #             and G1.edges[u[0], v[0]][edge_label] == G2.edges[u[1], v[1]][edge_label]:
+    #         gt.add_edge((u[0], u[1]), (v[0], v[1]))
+    #         gt.edges[(u[0], u[1]), (v[0], v[1])].update(
+    #             {edge_label: G1.edges[u[0], v[0]][edge_label]})
+
+    # relabel nodes using consecutive integers for convenience of kernel calculation.
+    # gt = nx.convert_node_labels_to_integers(
+    #     gt, first_label=0, label_attribute='label_orignal')
+    return gt
+
+
+def direct_product_graph(G1, G2, node_labels, edge_labels):
+    """Return the direct/tensor product of directed graphs G1 and G2.
+
+    Parameters
+    ----------
+    G1, G2 : NetworkX graph
+        The original graphs.
+    node_labels : list
+        A list of node attributes used as labels.
+    edge_labels : list
+        A list of edge attributes used as labels.
+
+    Return
+    ------
+    gt : NetworkX graph
+        The direct product graph of G1 and G2.
+
+    Notes
+    -----
+    This method differs from networkx.tensor_product in that it only adds nodes and edges of G1 and G2 that have the same labels to the direct product graph.
+
+    References
+    ----------
+    .. [1] Thomas Gärtner, Peter Flach, and Stefan Wrobel. On graph kernels: Hardness results and efficient alternatives. Learning Theory and Kernel Machines, pages 129–143, 2003.
+    """
+    from itertools import product
+    # G = G.to_directed()
+    gt = nx.DiGraph()
+    # add nodes: only pairs of nodes carrying the same (multi-dimensional) label.
+    for u, v in product(G1, G2):
+        label1 = tuple(G1.nodes[u][nl] for nl in node_labels)
+        label2 = tuple(G2.nodes[v][nl] for nl in node_labels)
+        if label1 == label2:
+            gt.add_node((u, v), node_label=label1)
+
+    # add edges; faster for sparse graphs (not too many edges), which is the most common case for now.
+    for (u1, v1), (u2, v2) in product(G1.edges, G2.edges):
+        if (u1, u2) in gt and (v1, v2) in gt:
+            label1 = tuple(G1.edges[u1, v1][el] for el in edge_labels)
+            label2 = tuple(G2.edges[u2, v2][el] for el in edge_labels)
+            if label1 == label2:
+                gt.add_edge((u1, u2), (v1, v2), edge_label=label1)
+
+    # # add edges; faster for dense graphs (many edges, complete graphs benefit the most).
+    # for u, v in product(gt, gt):
+    #     if (u[0], v[0]) in G1.edges and (u[1], v[1]) in G2.edges \
+    #             and G1.edges[u[0], v[0]][edge_label] == G2.edges[u[1], v[1]][edge_label]:
+    #         gt.add_edge((u[0], u[1]), (v[0], v[1]))
+    #         gt.edges[(u[0], u[1]), (v[0], v[1])].update(
+    #             {edge_label: G1.edges[u[0], v[0]][edge_label]})
+
+    # relabel nodes using consecutive integers for convenience of kernel calculation.
+    # gt = nx.convert_node_labels_to_integers(
+    #     gt, first_label=0, label_attribute='label_orignal')
+    return gt
+
+
+def graph_deepcopy(G):
+    """Deep copy a graph, including a deep copy of all nodes, edges and
+    attributes of the graph, its nodes and its edges.
+
+    Note
+    ----
+    As far as I know, this is the same as the NetworkX method graph.copy().
+    """
+    # add graph attributes.
+    labels = {}
+    for k, v in G.graph.items():
+        labels[k] = deepcopy(v)
+    if G.is_directed():
+        G_copy = nx.DiGraph(**labels)
+    else:
+        G_copy = nx.Graph(**labels)
+
+    # add nodes.
+    for nd, attrs in G.nodes(data=True):
+        labels = {}
+        for k, v in attrs.items():
+            labels[k] = deepcopy(v)
+        G_copy.add_node(nd, **labels)
+
+    # add edges.
+    for nd1, nd2, attrs in G.edges(data=True):
+        labels = {}
+        for k, v in attrs.items():
+            labels[k] = deepcopy(v)
+        G_copy.add_edge(nd1, nd2, **labels)
+
+    return G_copy
+
+
+def graph_isIdentical(G1, G2):
+    """Check if two graphs are identical, including: same nodes, edges, node
+    labels/attributes and edge labels/attributes.
+
+    Notes
+    -----
+    1. The two graphs must be of the same type.
+
+    2. Global/graph attributes are neglected, as they may contain the names of the graphs.
+    """
+    # check nodes.
+    nlist1 = [n for n in G1.nodes(data=True)]
+    nlist2 = [n for n in G2.nodes(data=True)]
+    if not nlist1 == nlist2:
+        return False
+    # check edges.
+    elist1 = [n for n in G1.edges(data=True)]
+    elist2 = [n for n in G2.edges(data=True)]
+    if not elist1 == elist2:
+        return False
+    # graph attributes are not checked (see Notes).
+
+    return True
+
+
+def get_node_labels(Gn, node_label):
+    """Get node labels of dataset Gn.
+    """
+    nl = set()
+    for G in Gn:
+        nl = nl | set(nx.get_node_attributes(G, node_label).values())
+    return nl
+
+
+def get_edge_labels(Gn, edge_label):
+    """Get edge labels of dataset Gn.
+    """
+    el = set()
+    for G in Gn:
+        el = el | set(nx.get_edge_attributes(G, edge_label).values())
+    return el
+
+
+def get_graph_kernel_by_name(name, node_labels=None, edge_labels=None, node_attrs=None, edge_attrs=None, ds_infos=None, kernel_options={}):
+    if name == 'Marginalized':
+        from gklearn.kernels import Marginalized
+        graph_kernel = Marginalized(node_labels=node_labels,
+                                    edge_labels=edge_labels,
+                                    ds_infos=ds_infos,
+                                    **kernel_options)
+    elif name == 'ShortestPath':
+        from gklearn.kernels import ShortestPath
+        graph_kernel = ShortestPath(node_labels=node_labels,
+                                    node_attrs=node_attrs,
+                                    ds_infos=ds_infos,
+                                    **kernel_options)
+    elif name == 'StructuralSP':
+        from gklearn.kernels import StructuralSP
+        graph_kernel = StructuralSP(node_labels=node_labels,
+                                    edge_labels=edge_labels,
+                                    node_attrs=node_attrs,
+                                    edge_attrs=edge_attrs,
+                                    ds_infos=ds_infos,
+                                    **kernel_options)
+    elif name == 'PathUpToH':
+        from gklearn.kernels import PathUpToH
+        graph_kernel = PathUpToH(node_labels=node_labels,
+                                 edge_labels=edge_labels,
+                                 ds_infos=ds_infos,
+                                 **kernel_options)
+    elif name == 'Treelet':
+        from gklearn.kernels import Treelet
+        graph_kernel = Treelet(node_labels=node_labels,
+                               edge_labels=edge_labels,
+                               ds_infos=ds_infos,
+                               **kernel_options)
+    elif name == 'WLSubtree':
+        from gklearn.kernels import WLSubtree
+        graph_kernel = WLSubtree(node_labels=node_labels,
+                                 edge_labels=edge_labels,
+                                 ds_infos=ds_infos,
+                                 **kernel_options)
+    elif name == 'WeisfeilerLehman':
+        from gklearn.kernels import WeisfeilerLehman
+        graph_kernel = WeisfeilerLehman(node_labels=node_labels,
+                                        edge_labels=edge_labels,
+                                        ds_infos=ds_infos,
+                                        **kernel_options)
+    else:
+        raise Exception('The graph kernel given is not defined. Possible choices include: "Marginalized", "ShortestPath", "StructuralSP", "PathUpToH", "Treelet", "WLSubtree", "WeisfeilerLehman".')
+
+    return graph_kernel
+
+
+def compute_gram_matrices_by_class(ds_name, kernel_options, save_results=True, dir_save='', irrelevant_labels=None, edge_required=False):
+    import os
+    from gklearn.utils import Dataset, split_dataset_by_target
+
+    # 1. get dataset.
+    print('1. getting dataset...')
+    dataset_all = Dataset()
+    dataset_all.load_predefined_dataset(ds_name)
+    dataset_all.trim_dataset(edge_required=edge_required)
+    if irrelevant_labels is not None:
+        dataset_all.remove_labels(**irrelevant_labels)
+    # dataset_all.cut_graphs(range(0, 10))
+    datasets = split_dataset_by_target(dataset_all)
+
+    gram_matrix_unnorm_list = []
+    run_time_list = []
+
+    print('start generating preimage for each class of target...')
+    for idx, dataset in enumerate(datasets):
+        target = dataset.targets[0]
+        print('\ntarget =', target, '\n')
+
+        # 2. initialize graph kernel.
+        print('2. initializing graph kernel and setting parameters...')
+        graph_kernel = get_graph_kernel_by_name(kernel_options['name'],
+                                                node_labels=dataset.node_labels,
+                                                edge_labels=dataset.edge_labels,
+                                                node_attrs=dataset.node_attrs,
+                                                edge_attrs=dataset.edge_attrs,
+                                                ds_infos=dataset.get_dataset_infos(keys=['directed']),
+                                                kernel_options=kernel_options)
+
+        # 3. compute gram matrix.
+        print('3. computing gram matrix...')
+        gram_matrix, run_time = graph_kernel.compute(dataset.graphs, **kernel_options)
+        gram_matrix_unnorm = graph_kernel.gram_matrix_unnorm
+
+        gram_matrix_unnorm_list.append(gram_matrix_unnorm)
+        run_time_list.append(run_time)
+
+    # 4. save results.
+    print()
+    print('4. saving results...')
+    if save_results:
+        if not os.path.exists(dir_save):
+            os.makedirs(dir_save)
+        np.savez(dir_save + 'gram_matrix_unnorm.' + ds_name + '.' + kernel_options['name'] + '.gm', gram_matrix_unnorm_list=gram_matrix_unnorm_list, run_time_list=run_time_list)
+
+    print('\ncomplete.')
+
+
+def find_paths(G, source_node, length):
+    """Find all paths of a given length that start from a source node.
+    A recursive depth-first search is applied.
+
+    Parameters
+    ----------
+    G : NetworkX graph
+        The graph in which paths are searched.
+    source_node : integer
+        The node from which all paths start.
+    length : integer
+        The length of the paths.
+
+    Return
+    ------
+    path : list of list
+        List of paths retrieved, where each path is represented by a list of nodes.
+    """
+    if length == 0:
+        return [[source_node]]
+    path = [[source_node] + path for neighbor in G[source_node]
+            for path in find_paths(G, neighbor, length - 1) if source_node not in path]
+    return path
+
+
+def find_all_paths(G, length, is_directed):
+    """Find all paths of a given length in a graph. A recursive depth-first
+    search is applied.
+
+    Parameters
+    ----------
+    G : NetworkX graph
+        The graph in which paths are searched.
+    length : integer
+        The length of the paths.
+    is_directed : boolean
+        Whether the graph is directed.
+
+    Return
+    ------
+    path : list of list
+        List of paths retrieved, where each path is represented by a list of nodes.
+    """
+    all_paths = []
+    for node in G:
+        all_paths.extend(find_paths(G, node, length))
+
+    if not is_directed:
+        # In an undirected graph each path is found twice, once from each of its
+        # two end nodes; remove one of the two copies.
+        all_paths_r = [path[::-1] for path in all_paths]
+        for idx, path in enumerate(all_paths[:-1]):
+            for path2 in all_paths_r[idx + 1::]:
+                if path == path2:
+                    all_paths[idx] = []
+                    break
+        all_paths = list(filter(lambda a: a != [], all_paths))
+
+    return all_paths
+
+
+def get_mlti_dim_node_attrs(G, attr_names):
+    """Get multi-dimensional node attributes of graph G, one tuple per node."""
+    attributes = []
+    for nd, attrs in G.nodes(data=True):
+        attributes.append(tuple(attrs[aname] for aname in attr_names))
+    return attributes
+
+
+def get_mlti_dim_edge_attrs(G, attr_names):
+    """Get multi-dimensional edge attributes of graph G, one tuple per edge."""
+    attributes = []
+    for ed, attrs in G.edges(data=True):
+        attributes.append(tuple(attrs[aname] for aname in attr_names))
+    return attributes
+
+
+def normalize_gram_matrix(gram_matrix):
+    """Normalize a Gram matrix in place so that all diagonal entries equal 1."""
+    diag = gram_matrix.diagonal().copy()
+    for i in range(len(gram_matrix)):
+        for j in range(i, len(gram_matrix)):
+            gram_matrix[i][j] /= np.sqrt(diag[i] * diag[j])
+            gram_matrix[j][i] = gram_matrix[i][j]
+    return gram_matrix
+
+
+def compute_distance_matrix(gram_matrix):
+    """Compute the kernel-induced distance matrix of a Gram matrix, along with
+    its maximum, minimum (over non-zero entries) and mean values."""
+    dis_mat = np.empty((len(gram_matrix), len(gram_matrix)))
+    for i in range(len(gram_matrix)):
+        for j in range(i, len(gram_matrix)):
+            dis = gram_matrix[i, i] + gram_matrix[j, j] - 2 * gram_matrix[i, j]
+            if dis < 0:
+                if dis > -1e-10:
+                    dis = 0
+                else:
+                    raise ValueError('The distance is negative.')
+            dis_mat[i, j] = np.sqrt(dis)
+            dis_mat[j, i] = dis_mat[i, j]
+    dis_max = np.max(np.max(dis_mat))
+    dis_min = np.min(np.min(dis_mat[dis_mat != 0]))
+    dis_mean = np.mean(np.mean(dis_mat))
+    return dis_mat, dis_max, dis_min, dis_mean
+
+
+def dummy_node():
+    """
+    /*!
+     * @brief Returns a dummy node.
+     * @return ID of dummy node.
+     */
+    """
+    return np.inf  # @todo: in GEDLIB, this is the max - 1 rather than max, I don't know why.
+
+
+def undefined_node():
+    """
+    /*!
+     * @brief Returns an undefined node.
+     * @return ID of undefined node.
+     */
+    """
+    return np.inf
+
+
+def dummy_edge():
+    """
+    /*!
+     * @brief Returns a dummy edge.
+     * @return ID of dummy edge.
+     */
+    """
+    return np.inf
+
+
+@unique
+class SpecialLabel(Enum):
+    """Can be used to define special labels.
+    """
+    DUMMY = 1  # The dummy label.
+    # DUMMY = auto  # enum.auto does not exist in Python 3.5.
\ No newline at end of file
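
The docstrings of getSPGraph and floydTransformation above describe the shortest-paths transformation; the snippet below is a minimal usage sketch, not part of the patch, which assumes those functions are in scope (for instance imported from the gklearn.utils.utils module added here).

import networkx as nx

# A small unweighted graph: 0 - 1 - 2 - 3.
G = nx.path_graph(4)
# With edge_weight=None every edge counts as weight 1.
S = getSPGraph(G)
# S keeps the nodes of G and connects every pair of nodes reachable in G;
# each edge carries the shortest-path length in its 'cost' attribute.
for u, v, cost in sorted(S.edges(data='cost')):
    print(u, v, cost)   # e.g. the pair (0, 3) gets cost 3.0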
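
A similar sketch for direct_product on two tiny labeled graphs, using the default 'atom' and 'bond_type' labels mentioned in its docstring; the node identifiers and label values below are made up for illustration.

import networkx as nx

G1 = nx.DiGraph()
G1.add_node(0, atom='C')
G1.add_node(1, atom='O')
G1.add_edge(0, 1, bond_type='single')

G2 = nx.DiGraph()
G2.add_node('a', atom='C')
G2.add_node('b', atom='O')
G2.add_edge('a', 'b', bond_type='single')

gp = direct_product(G1, G2, 'atom', 'bond_type')
print(list(gp.nodes(data=True)))  # only label-compatible pairs: (0, 'a') and (1, 'b')
print(list(gp.edges(data=True)))  # one edge ((0, 'a'), (1, 'b')) labeled 'single'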
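
find_paths and find_all_paths enumerate fixed-length paths by recursive depth-first search; a small sketch of the undirected case, where reversed duplicates are removed (again assuming the functions are in scope):

import networkx as nx

G = nx.cycle_graph(4)  # 0 - 1 - 2 - 3 - 0
paths = find_all_paths(G, 2, is_directed=False)
print(paths)
# Each length-2 path appears exactly once, e.g. [0, 1, 2] is kept
# while its reversed copy [2, 1, 0] is dropped.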
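
normalize_gram_matrix divides each entry k(i, j) by sqrt(k(i, i) * k(j, j)), and compute_distance_matrix turns a Gram matrix into the kernel-induced distances d(i, j) = sqrt(k(i, i) + k(j, j) - 2 * k(i, j)). A minimal sketch on a hand-made Gram matrix (a linear kernel on three toy vectors, not data from the library):

import numpy as np

X = np.array([[1.0, 0.0], [1.0, 1.0], [0.0, 2.0]])
K = X @ X.T                                  # a valid (positive semidefinite) Gram matrix

K_norm = normalize_gram_matrix(K.copy())     # unit diagonal after normalization
dis_mat, dis_max, dis_min, dis_mean = compute_distance_matrix(K.copy())

print(np.round(K_norm, 3))
print(np.round(dis_mat, 3))                  # zero diagonal, symmetric distances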