import math import random from copy import deepcopy from functools import total_ordering from queue import PriorityQueue import numpy as np from scipy.linalg import LinAlgError, cho_solve, cholesky, solve_triangular from scipy.optimize import linear_sum_assignment from sklearn.metrics.pairwise import rbf_kernel from .graph_transformer import transform from .layers import is_layer from utils import Constant, OptimizeMode import logging logger = logging.getLogger(__name__) # equation(6) dl def layer_distance(a, b): """The distance between two layers.""" # pylint: disable=unidiomatic-typecheck if not isinstance(a, type(b)): return 1.0 if is_layer(a, "Conv"): att_diff = [ (a.filters, b.filters), (a.kernel_size, b.kernel_size), (a.stride, b.stride), ] return attribute_difference(att_diff) if is_layer(a, "Pooling"): att_diff = [ (a.padding, b.padding), (a.kernel_size, b.kernel_size), (a.stride, b.stride), ] return attribute_difference(att_diff) return 0.0 # equation(6) def attribute_difference(att_diff): ''' The attribute distance. ''' ret = 0 for a_value, b_value in att_diff: if max(a_value, b_value) == 0: ret += 0 else: ret += abs(a_value - b_value) * 1.0 / max(a_value, b_value) return ret * 1.0 / len(att_diff) # equation(7) A def layers_distance(list_a, list_b): """The distance between the layers of two neural networks.""" len_a = len(list_a) len_b = len(list_b) f = np.zeros((len_a + 1, len_b + 1)) f[-1][-1] = 0 for i in range(-1, len_a): f[i][-1] = i + 1 for j in range(-1, len_b): f[-1][j] = j + 1 for i in range(len_a): for j in range(len_b): f[i][j] = min( f[i][j - 1] + 1, f[i - 1][j] + 1, f[i - 1][j - 1] + layer_distance(list_a[i], list_b[j]), ) return f[len_a - 1][len_b - 1] # equation (9) ds # 0: topo rank of the start, 1: rank of the end def skip_connection_distance(a, b): """The distance between two skip-connections.""" if a[2] != b[2]: return 1.0 len_a = abs(a[1] - a[0]) len_b = abs(b[1] - b[0]) return (abs(a[0] - b[0]) + abs(len_a - len_b)) / \ (max(a[0], b[0]) + max(len_a, len_b)) # equation (8) Ds # convert equation (8) minimization part into a bipartite graph matching problem and solved by hungarian algorithm(linear_sum_assignment) def skip_connections_distance(list_a, list_b): """The distance between the skip-connections of two neural networks.""" distance_matrix = np.zeros((len(list_a), len(list_b))) for i, a in enumerate(list_a): for j, b in enumerate(list_b): distance_matrix[i][j] = skip_connection_distance(a, b) return distance_matrix[linear_sum_assignment(distance_matrix)].sum() + abs( len(list_a) - len(list_b) ) # equation (4) def edit_distance(x, y): """The distance between two neural networks. Args: x: An instance of NetworkDescriptor. y: An instance of NetworkDescriptor Returns: The edit-distance between x and y. """ ret = layers_distance(x.layers, y.layers) ret += Constant.KERNEL_LAMBDA * skip_connections_distance( x.skip_connections, y.skip_connections ) return ret class IncrementalGaussianProcess: """Gaussian process regressor. Attributes: alpha: A hyperparameter. """ def __init__(self): self.alpha = 1e-10 self._distance_matrix = None self._x = None self._y = None self._first_fitted = False self._l_matrix = None self._alpha_vector = None @property def kernel_matrix(self): ''' Kernel matric. ''' return self._distance_matrix def fit(self, train_x, train_y): """ Fit the regressor with more data. Args: train_x: A list of NetworkDescriptor. train_y: A list of metric values. """ if self.first_fitted: self.incremental_fit(train_x, train_y) else: self.first_fit(train_x, train_y) # compute the kernel matrix k, alpha_vector # 和first fit区别就是需要加入新的训练样本扩充distance matrix def incremental_fit(self, train_x, train_y): """ Incrementally fit the regressor. """ if not self._first_fitted: raise ValueError( "The first_fit function needs to be called first.") train_x, train_y = np.array(train_x), np.array(train_y) # Incrementally compute K up_right_k = edit_distance_matrix(self._x, train_x) down_left_k = np.transpose(up_right_k) down_right_k = edit_distance_matrix(train_x) up_k = np.concatenate((self._distance_matrix, up_right_k), axis=1) down_k = np.concatenate((down_left_k, down_right_k), axis=1) temp_distance_matrix = np.concatenate((up_k, down_k), axis=0) k_matrix = bourgain_embedding_matrix(temp_distance_matrix) diagonal = np.diag_indices_from(k_matrix) diagonal = (diagonal[0][-len(train_x):], diagonal[1][-len(train_x):]) k_matrix[diagonal] += self.alpha try: self._l_matrix = cholesky(k_matrix, lower=True) # Line 2 except LinAlgError as err: logger.error('LinAlgError') return self self._x = np.concatenate((self._x, train_x), axis=0) self._y = np.concatenate((self._y, train_y), axis=0) self._distance_matrix = temp_distance_matrix self._alpha_vector = cho_solve( (self._l_matrix, True), self._y) # Line 3 return self @property def first_fitted(self): ''' if it is firsr fitted ''' return self._first_fitted # update过程,第一次fit。 def first_fit(self, train_x, train_y): """ Fit the regressor for the first time. """ train_x, train_y = np.array(train_x), np.array(train_y) self._x = np.copy(train_x) self._y = np.copy(train_y) self._distance_matrix = edit_distance_matrix(self._x) k_matrix = bourgain_embedding_matrix(self._distance_matrix) k_matrix[np.diag_indices_from(k_matrix)] += self.alpha self._l_matrix = cholesky(k_matrix, lower=True) # Line 2 # cho_solve Ax = b return x = A^{-1}b self._alpha_vector = cho_solve( (self._l_matrix, True), self._y) # Line 3 self._first_fitted = True return self # 获得 predictive distribution 的 mean & std def predict(self, train_x): """Predict the result. Args: train_x: A list of NetworkDescriptor. Returns: y_mean: The predicted mean. y_std: The predicted standard deviation. """ k_trans = np.exp(-np.power(edit_distance_matrix(train_x, self._x), 2)) y_mean = k_trans.dot(self._alpha_vector) # Line 4 (y_mean = f_star) # compute inverse K_inv of K based on its Cholesky # decomposition L and its inverse L_inv l_inv = solve_triangular( self._l_matrix.T, np.eye( self._l_matrix.shape[0])) k_inv = l_inv.dot(l_inv.T) # Compute variance of predictive distribution y_var = np.ones(len(train_x), dtype=np.float) y_var -= np.einsum("ij,ij->i", np.dot(k_trans, k_inv), k_trans) # Check if any of the variances is negative because of # numerical issues. If yes: set the variance to 0. y_var_negative = y_var < 0 if np.any(y_var_negative): y_var[y_var_negative] = 0.0 return y_mean, np.sqrt(y_var) def edit_distance_matrix(train_x, train_y=None): """Calculate the edit distance. Args: train_x: A list of neural architectures. train_y: A list of neural architectures. Returns: An edit-distance matrix. """ if train_y is None: ret = np.zeros((train_x.shape[0], train_x.shape[0])) for x_index, x in enumerate(train_x): for y_index, y in enumerate(train_x): if x_index == y_index: ret[x_index][y_index] = 0 elif x_index < y_index: ret[x_index][y_index] = edit_distance(x, y) else: ret[x_index][y_index] = ret[y_index][x_index] return ret ret = np.zeros((train_x.shape[0], train_y.shape[0])) for x_index, x in enumerate(train_x): for y_index, y in enumerate(train_y): ret[x_index][y_index] = edit_distance(x, y) return ret def vector_distance(a, b): """The Euclidean distance between two vectors.""" a = np.array(a) b = np.array(b) return np.linalg.norm(a - b) # 从edit-distance矩阵空间到欧几里得空间的映射 def bourgain_embedding_matrix(distance_matrix): """Use Bourgain algorithm to embed the neural architectures based on their edit-distance. Args: distance_matrix: A matrix of edit-distances. Returns: A matrix of distances after embedding. """ distance_matrix = np.array(distance_matrix) n = len(distance_matrix) if n == 1: return distance_matrix np.random.seed(123) distort_elements = [] r = range(n) k = int(math.ceil(math.log(n) / math.log(2) - 1)) t = int(math.ceil(math.log(n))) counter = 0 for i in range(0, k + 1): for t in range(t): s = np.random.choice(r, 2 ** i) for j in r: d = min([distance_matrix[j][s] for s in s]) counter += len(s) if i == 0 and t == 0: distort_elements.append([d]) else: distort_elements[j].append(d) return rbf_kernel(distort_elements, distort_elements) class BayesianOptimizer: """ A Bayesian optimizer for neural architectures. Attributes: searcher: The Searcher which is calling the Bayesian optimizer. t_min: The minimum temperature for simulated annealing. metric: An instance of the Metric subclasses. gpr: A GaussianProcessRegressor for bayesian optimization. beta: The beta in acquisition function. (refer to our paper) search_tree: The network morphism search tree. """ def __init__(self, searcher, t_min, optimizemode, beta=None): self.searcher = searcher self.t_min = t_min self.optimizemode = optimizemode self.gpr = IncrementalGaussianProcess() self.beta = beta if beta is not None else Constant.BETA self.search_tree = SearchTree() def fit(self, x_queue, y_queue): """ Fit the optimizer with new architectures and performances. Args: x_queue: A list of NetworkDescriptor. y_queue: A list of metric values. """ self.gpr.fit(x_queue, y_queue) # Algorithm 1 # optimize acquisition function def generate(self, descriptors): """Generate new architecture. Args: descriptors: All the searched neural architectures. (search history) Returns: graph: An instance of Graph. A morphed neural network with weights. father_id: The father node ID in the search tree. """ model_ids = self.search_tree.adj_list.keys() target_graph = None father_id = None descriptors = deepcopy(descriptors) elem_class = Elem if self.optimizemode is OptimizeMode.Maximize: elem_class = ReverseElem ''' 1.初始化优先队列 2.优先队列里面元素为之前所有生成的模型 ''' pq = PriorityQueue() temp_list = [] for model_id in model_ids: metric_value = self.searcher.get_metric_value_by_id(model_id) temp_list.append((metric_value, model_id)) temp_list = sorted(temp_list) for metric_value, model_id in temp_list: graph = self.searcher.load_model_by_id(model_id) graph.clear_operation_history() graph.clear_weights() # 已经产生的模型father_id就是自己的id pq.put(elem_class(metric_value, model_id, graph)) t = 1.0 t_min = self.t_min alpha = 0.9 opt_acq = self._get_init_opt_acq_value() num_iter = 0 # logger.info('initial queue size ', pq.qsize()) while not pq.empty() and t > t_min: num_iter += 1 elem = pq.get() # logger.info("elem.metric_value:{}".format(elem.metric_value)) # logger.info("opt_acq:{}".format(opt_acq)) if self.optimizemode is OptimizeMode.Maximize: temp_exp = min((elem.metric_value - opt_acq) / t, 1.0) else: temp_exp = min((opt_acq - elem.metric_value) / t, 1.0) # logger.info("temp_exp this round ", temp_exp) ap = math.exp(temp_exp) # logger.info("ap this round ", ap) if ap >= random.uniform(0, 1): # line 9,10 in algorithm 1 for temp_graph in transform(elem.graph): # 已经出现过的网络不加入 if contain(descriptors, temp_graph.extract_descriptor()): continue #用acq作为贝叶斯模型给出的评价标准 temp_acq_value = self.acq(temp_graph) # 这个优先队列会不断增长,就算transform出来的网络也会进入。 pq.put( # 记住这个模型是从哪个father生长出来的 elem_class( temp_acq_value, elem.father_id, temp_graph)) # logger.info('temp_acq_value ', temp_acq_value) # logger.info('queue size ', pq.qsize()) descriptors.append(temp_graph.extract_descriptor()) # 选一个最好的当父 if self._accept_new_acq_value(opt_acq, temp_acq_value): opt_acq = temp_acq_value father_id = elem.father_id target_graph = deepcopy(temp_graph) t *= alpha # logger.info('number of iter in this search {}'.format(num_iter)) # Did not found a not duplicated architecture if father_id is None: return None, None nm_graph = self.searcher.load_model_by_id(father_id) # 从当前父graph开始,根据target_graph中的operation_history,一步步从当前父网络操作到target_graph # 因为在存入pq时进行了clear_operation_history()操作。等于target_graph中只存了从当前父网络到target_graph的操作 # 而nm_graph中的operation_history保存完整的,到基类的history for args in target_graph.operation_history: getattr(nm_graph, args[0])(*list(args[1:])) # target space return nm_graph, father_id # equation (10) def acq(self, graph): ''' estimate the value of generated graph ''' mean, std = self.gpr.predict(np.array([graph.extract_descriptor()])) if self.optimizemode is OptimizeMode.Maximize: return mean + self.beta * std return mean - self.beta * std def _get_init_opt_acq_value(self): if self.optimizemode is OptimizeMode.Maximize: return -np.inf return np.inf def _accept_new_acq_value(self, opt_acq, temp_acq_value): if temp_acq_value > opt_acq and self.optimizemode is OptimizeMode.Maximize: return True if temp_acq_value < opt_acq and not self.optimizemode is OptimizeMode.Maximize: return True return False def add_child(self, father_id, model_id): ''' add child to the search tree Arguments: father_id {int} -- father id model_id {int} -- model id ''' self.search_tree.add_child(father_id, model_id) @total_ordering class Elem: """Elements to be sorted according to metric value.""" def __init__(self, metric_value, father_id, graph): self.father_id = father_id self.graph = graph self.metric_value = metric_value def __eq__(self, other): return self.metric_value == other.metric_value def __lt__(self, other): return self.metric_value < other.metric_value class ReverseElem(Elem): """Elements to be reversely sorted according to metric value.""" def __lt__(self, other): return self.metric_value > other.metric_value def contain(descriptors, target_descriptor): """Check if the target descriptor is in the descriptors.""" for descriptor in descriptors: if edit_distance(descriptor, target_descriptor) < 1e-5: return True return False class SearchTree: """The network morphism search tree.""" def __init__(self): self.root = None self.adj_list = {} def add_child(self, u, v): ''' add child to search tree itself. Arguments: u {int} -- father id v {int} -- child id ''' if u == -1: self.root = v self.adj_list[v] = [] return if v not in self.adj_list[u]: self.adj_list[u].append(v) if v not in self.adj_list: self.adj_list[v] = [] def get_dict(self, u=None): """ A recursive function to return the content of the tree in a dict.""" if u is None: return self.get_dict(self.root) children = [] for v in self.adj_list[u]: children.append(self.get_dict(v)) ret = {"name": u, "children": children} return ret