|
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517 |
-
- import math
- import random
- from copy import deepcopy
- from functools import total_ordering
- from queue import PriorityQueue
-
- import numpy as np
- from scipy.linalg import LinAlgError, cho_solve, cholesky, solve_triangular
- from scipy.optimize import linear_sum_assignment
- from sklearn.metrics.pairwise import rbf_kernel
-
- from .graph_transformer import transform
- from .layers import is_layer
- from utils import Constant, OptimizeMode
- import logging
-
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
-
# Equation (6), d_l: distance between a single pair of layers.
def layer_distance(a, b):
    """Return the distance between two layers.

    Args:
        a: A layer object.
        b: A layer object.

    Returns:
        1.0 when ``a`` is not an instance of ``type(b)``; otherwise the
        averaged relative attribute difference for layers that ``is_layer``
        reports as "Conv" (filters, kernel_size, stride) or "Pooling"
        (padding, kernel_size, stride); 0.0 for any other layer kind.
    """
    # The check is intentionally one-directional: an instance of a subclass
    # of type(b) still counts as "the same kind" of layer.
    # pylint: disable=unidiomatic-typecheck
    if not isinstance(a, type(b)):
        return 1.0

    if is_layer(a, "Conv"):
        return attribute_difference([
            (a.filters, b.filters),
            (a.kernel_size, b.kernel_size),
            (a.stride, b.stride),
        ])

    if is_layer(a, "Pooling"):
        return attribute_difference([
            (a.padding, b.padding),
            (a.kernel_size, b.kernel_size),
            (a.stride, b.stride),
        ])

    # Layers of the same type without compared attributes are identical.
    return 0.0
-
# Equation (6): averaged relative difference of paired attributes.
def attribute_difference(att_diff):
    """Return the mean relative difference over pairs of attribute values.

    Args:
        att_diff: A sized collection of ``(a_value, b_value)`` numeric pairs.

    Returns:
        The average of ``|a - b| / max(a, b)`` over all pairs. A pair whose
        maximum is 0 contributes 0. An empty collection yields 0.0.
    """
    # No attributes to compare means no difference (and avoids dividing by 0).
    if not att_diff:
        return 0.0

    total = 0.0
    for a_value, b_value in att_diff:
        larger = max(a_value, b_value)
        # Skip pairs whose maximum is 0: the ratio would be undefined.
        if larger != 0:
            total += abs(a_value - b_value) / larger
    return total / len(att_diff)
-
# Equation (7), A: edit distance between two layer sequences.
def layers_distance(list_a, list_b):
    """Return the distance between the layers of two neural networks.

    This is a sequence edit distance computed by dynamic programming:
    inserting or deleting a layer costs 1 and substituting one layer for
    another costs ``layer_distance`` of the two layers.

    Args:
        list_a: A sequence of layers.
        list_b: A sequence of layers.

    Returns:
        The edit distance as a NumPy float.
    """
    len_a = len(list_a)
    len_b = len(list_b)

    # cost[i][j] is the distance between the first i layers of list_a and
    # the first j layers of list_b; row/column 0 is the empty prefix.
    cost = np.zeros((len_a + 1, len_b + 1))
    cost[:, 0] = np.arange(len_a + 1)
    cost[0, :] = np.arange(len_b + 1)

    for i, layer_a in enumerate(list_a, start=1):
        for j, layer_b in enumerate(list_b, start=1):
            cost[i][j] = min(
                cost[i][j - 1] + 1,
                cost[i - 1][j] + 1,
                cost[i - 1][j - 1] + layer_distance(layer_a, layer_b),
            )
    return cost[len_a][len_b]
-
# Equation (9), d_s.
# Index 0: topological rank of the start, index 1: rank of the end.
def skip_connection_distance(a, b):
    """Return the distance between two skip-connections.

    Args:
        a: An indexable ``(start, end, kind)`` triple.
        b: An indexable ``(start, end, kind)`` triple.

    Returns:
        1.0 when the third elements differ; otherwise the sum of the
        start-rank difference and the length difference, normalised by the
        sum of the larger start rank and the larger length. Returns 0.0 when
        that normaliser is 0 (both start at rank 0 with zero length).
    """
    # NOTE(review): element 2 looks like the connection type - confirm.
    if a[2] != b[2]:
        return 1.0

    len_a = abs(a[1] - a[0])
    len_b = abs(b[1] - b[0])
    denominator = max(a[0], b[0]) + max(len_a, len_b)
    # A zero normaliser would otherwise raise ZeroDivisionError.
    if denominator == 0:
        return 0.0
    return (abs(a[0] - b[0]) + abs(len_a - len_b)) / denominator
-
# Equation (8), D_s.
# The minimisation in equation (8) is treated as a bipartite matching problem
# and solved with the Hungarian algorithm (linear_sum_assignment).
def skip_connections_distance(list_a, list_b):
    """Return the distance between the skip-connections of two networks.

    Args:
        list_a: A sequence of skip-connection triples.
        list_b: A sequence of skip-connection triples.

    Returns:
        The total cost of the optimal one-to-one matching between the two
        lists, plus 1 for every connection left unmatched because the lists
        differ in length.
    """
    unmatched = abs(len(list_a) - len(list_b))
    # With nothing to match on one side, only the length penalty remains;
    # this also avoids solving an assignment on a zero-size matrix.
    if not list_a or not list_b:
        return float(unmatched)

    distance_matrix = np.zeros((len(list_a), len(list_b)))
    for i, a in enumerate(list_a):
        for j, b in enumerate(list_b):
            distance_matrix[i][j] = skip_connection_distance(a, b)

    matched = linear_sum_assignment(distance_matrix)
    return distance_matrix[matched].sum() + unmatched
-
# Equation (4).
def edit_distance(x, y):
    """Return the distance between two neural networks.

    Args:
        x: An instance of NetworkDescriptor.
        y: An instance of NetworkDescriptor.

    Returns:
        The layer distance of ``x.layers`` and ``y.layers`` plus
        ``Constant.KERNEL_LAMBDA`` times the skip-connection distance of
        ``x.skip_connections`` and ``y.skip_connections``.
    """
    layer_term = layers_distance(x.layers, y.layers)
    skip_term = skip_connections_distance(
        x.skip_connections, y.skip_connections
    )
    return layer_term + Constant.KERNEL_LAMBDA * skip_term
-
-
class IncrementalGaussianProcess:
    """Gaussian process regressor over neural architectures.

    Attributes:
        alpha: Value added to the diagonal of the kernel matrix before the
            Cholesky factorisation.
    """

    def __init__(self):
        self.alpha = 1e-10
        self._distance_matrix = None
        self._x = None
        self._y = None
        self._first_fitted = False
        self._l_matrix = None
        self._alpha_vector = None

    @property
    def kernel_matrix(self):
        """Return the stored pairwise edit-distance matrix of the training set."""
        return self._distance_matrix

    @property
    def first_fitted(self):
        """Return True once ``first_fit`` has completed."""
        return self._first_fitted

    def fit(self, train_x, train_y):
        """Fit the regressor with more data.

        Dispatches to ``first_fit`` on the first call and to
        ``incremental_fit`` afterwards.

        Args:
            train_x: A list of NetworkDescriptor.
            train_y: A list of metric values.
        """
        if self.first_fitted:
            self.incremental_fit(train_x, train_y)
        else:
            self.first_fit(train_x, train_y)

    def first_fit(self, train_x, train_y):
        """Fit the regressor for the first time.

        Args:
            train_x: A list of NetworkDescriptor.
            train_y: A list of metric values.

        Returns:
            self.
        """
        train_x, train_y = np.array(train_x), np.array(train_y)

        self._x = np.copy(train_x)
        self._y = np.copy(train_y)

        self._distance_matrix = edit_distance_matrix(self._x)
        k_matrix = bourgain_embedding_matrix(self._distance_matrix)
        k_matrix[np.diag_indices_from(k_matrix)] += self.alpha

        # K = L L^T
        self._l_matrix = cholesky(k_matrix, lower=True)

        # cho_solve solves K x = y using the factor L, i.e. x = K^{-1} y.
        self._alpha_vector = cho_solve((self._l_matrix, True), self._y)

        self._first_fitted = True
        return self

    def incremental_fit(self, train_x, train_y):
        """Incrementally fit the regressor.

        Unlike ``first_fit``, the stored distance matrix is extended with the
        new samples before the kernel matrix and alpha vector are recomputed.
        If the Cholesky factorisation fails, the error is logged and the
        model is left unchanged.

        Args:
            train_x: A list of NetworkDescriptor to add.
            train_y: A list of metric values to add.

        Returns:
            self.

        Raises:
            ValueError: If ``first_fit`` has not been called yet.
        """
        if not self._first_fitted:
            raise ValueError(
                "The first_fit function needs to be called first.")

        train_x, train_y = np.array(train_x), np.array(train_y)

        # Extend the distance matrix block-wise:
        #   [ old      | old-new ]
        #   [ new-old  | new     ]
        up_right_k = edit_distance_matrix(self._x, train_x)
        down_left_k = np.transpose(up_right_k)
        down_right_k = edit_distance_matrix(train_x)
        up_k = np.concatenate((self._distance_matrix, up_right_k), axis=1)
        down_k = np.concatenate((down_left_k, down_right_k), axis=1)
        temp_distance_matrix = np.concatenate((up_k, down_k), axis=0)

        k_matrix = bourgain_embedding_matrix(temp_distance_matrix)

        # NOTE(review): alpha is only added to the diagonal entries of the
        # new samples although k_matrix is rebuilt from scratch, whereas
        # first_fit adds it to the whole diagonal - confirm this is intended.
        diagonal = np.diag_indices_from(k_matrix)
        diagonal = (diagonal[0][-len(train_x):], diagonal[1][-len(train_x):])
        k_matrix[diagonal] += self.alpha

        try:
            self._l_matrix = cholesky(k_matrix, lower=True)
        except LinAlgError:
            logger.error('LinAlgError')
            return self

        self._x = np.concatenate((self._x, train_x), axis=0)
        self._y = np.concatenate((self._y, train_y), axis=0)
        self._distance_matrix = temp_distance_matrix
        self._alpha_vector = cho_solve((self._l_matrix, True), self._y)

        return self

    def predict(self, train_x):
        """Return the mean and standard deviation of the predictive distribution.

        Args:
            train_x: The NetworkDescriptor objects to predict for (callers in
                this file pass a NumPy array).

        Returns:
            y_mean: The predicted mean.
            y_std: The predicted standard deviation.
        """
        # NOTE(review): the cross-kernel here is exp(-d^2) on raw edit
        # distances, while the training kernel comes from
        # bourgain_embedding_matrix - confirm the two are meant to differ.
        k_trans = np.exp(-np.power(edit_distance_matrix(train_x, self._x), 2))
        y_mean = k_trans.dot(self._alpha_vector)

        # Recover K^{-1} from the Cholesky factor: solving L^T X = I gives
        # X = L^{-T}, and X X^T = (L L^T)^{-1}.
        l_inv = solve_triangular(
            self._l_matrix.T, np.eye(self._l_matrix.shape[0]))
        k_inv = l_inv.dot(l_inv.T)

        # Variance of the predictive distribution. The builtin float is used
        # because the np.float alias was removed from NumPy.
        y_var = np.ones(len(train_x), dtype=float)
        y_var -= np.einsum("ij,ij->i", np.dot(k_trans, k_inv), k_trans)

        # Numerical error can push variances slightly below zero; clamp them.
        y_var[y_var < 0] = 0.0
        return y_mean, np.sqrt(y_var)
-
-
def edit_distance_matrix(train_x, train_y=None):
    """Calculate a matrix of pairwise edit distances.

    Args:
        train_x: A sequence (list or 1-D array) of neural architectures.
        train_y: An optional second sequence of neural architectures.

    Returns:
        When ``train_y`` is None, a symmetric ``len(train_x)`` square matrix
        of distances within ``train_x`` with a zero diagonal. Otherwise a
        ``len(train_x)`` by ``len(train_y)`` matrix of distances between the
        two sequences.
    """
    n_x = len(train_x)

    if train_y is None:
        ret = np.zeros((n_x, n_x))
        # Compute the upper triangle only and mirror it; the diagonal stays 0.
        for i in range(n_x):
            for j in range(i + 1, n_x):
                ret[i][j] = ret[j][i] = edit_distance(train_x[i], train_x[j])
        return ret

    ret = np.zeros((n_x, len(train_y)))
    for x_index, x in enumerate(train_x):
        for y_index, y in enumerate(train_y):
            ret[x_index][y_index] = edit_distance(x, y)
    return ret
-
-
def vector_distance(a, b):
    """Return the Euclidean distance between two vectors.

    Args:
        a: An array-like of numbers.
        b: An array-like of numbers with a shape compatible with ``a``.

    Returns:
        The norm of ``a - b`` as computed by ``np.linalg.norm``.
    """
    difference = np.array(a) - np.array(b)
    return np.linalg.norm(difference)
-
# Maps the edit-distance matrix space into a Euclidean space.
def bourgain_embedding_matrix(distance_matrix):
    """Embed architectures with the Bourgain algorithm and return their kernel.

    Each architecture gets one coordinate per random reference subset: its
    minimum edit distance to any member of that subset. Subsets of size
    ``2 ** i`` are drawn (with replacement) for every scale ``i`` in
    ``0..k``, a fixed number of times per scale. The RBF kernel of the
    resulting coordinates is returned.

    Args:
        distance_matrix: A square matrix of edit distances.

    Returns:
        The RBF kernel matrix of the embedded points. An input with at most
        one row is returned unchanged (as an array).
    """
    distance_matrix = np.array(distance_matrix)
    n = len(distance_matrix)
    # Nothing to embed; also keeps math.log away from n == 0.
    if n <= 1:
        return distance_matrix

    # Re-seeds NumPy's global generator so the embedding is reproducible.
    np.random.seed(123)

    max_scale = int(math.ceil(math.log(n) / math.log(2) - 1))
    trials_per_scale = int(math.ceil(math.log(n)))

    coordinates = []
    for scale in range(max_scale + 1):
        # A separate loop variable keeps trials_per_scale intact, so every
        # scale draws the same number of reference subsets.
        for _ in range(trials_per_scale):
            subset = np.random.choice(n, 2 ** scale)
            # Distance from every point to its nearest member of the subset.
            coordinates.append(distance_matrix[:, subset].min(axis=1))

    embedding = np.column_stack(coordinates)
    return rbf_kernel(embedding, embedding)
-
-
class BayesianOptimizer:
    """A Bayesian optimizer for neural architectures.

    Attributes:
        searcher: The Searcher which is calling the Bayesian optimizer.
        t_min: The minimum temperature for simulated annealing.
        optimizemode: The OptimizeMode telling whether the metric is
            maximized or minimized.
        gpr: An IncrementalGaussianProcess used as the surrogate model.
        beta: The beta in the acquisition function; defaults to
            ``Constant.BETA``.
        search_tree: The network morphism search tree.
    """

    def __init__(self, searcher, t_min, optimizemode, beta=None):
        self.searcher = searcher
        self.t_min = t_min
        self.optimizemode = optimizemode
        self.gpr = IncrementalGaussianProcess()
        self.beta = beta if beta is not None else Constant.BETA
        self.search_tree = SearchTree()

    def fit(self, x_queue, y_queue):
        """Fit the optimizer with new architectures and performances.

        Args:
            x_queue: A list of NetworkDescriptor.
            y_queue: A list of metric values.
        """
        self.gpr.fit(x_queue, y_queue)

    # Algorithm 1: optimize the acquisition function.
    def generate(self, descriptors):
        """Generate a new architecture.

        Args:
            descriptors: All the searched neural architectures (the search
                history). The list is deep-copied, not modified.

        Returns:
            graph: The father model loaded through the searcher with the
                selected morphing operations replayed on it, or None when no
                non-duplicate architecture was found.
            father_id: The father node ID in the search tree, or None.
        """
        model_ids = self.search_tree.adj_list.keys()

        target_graph = None
        father_id = None
        descriptors = deepcopy(descriptors)
        maximizing = self.optimizemode is OptimizeMode.Maximize
        elem_class = ReverseElem if maximizing else Elem

        # Initialise the priority queue with every model generated so far,
        # inserted in ascending order of (metric value, model id).
        pq = PriorityQueue()
        scored_ids = sorted(
            [(self.searcher.get_metric_value_by_id(model_id), model_id)
             for model_id in model_ids]
        )
        for metric_value, model_id in scored_ids:
            graph = self.searcher.load_model_by_id(model_id)
            graph.clear_operation_history()
            graph.clear_weights()
            # An already generated model uses its own id as father_id.
            pq.put(elem_class(metric_value, model_id, graph))

        t = 1.0
        t_min = self.t_min
        alpha = 0.9
        opt_acq = self._get_init_opt_acq_value()
        while not pq.empty() and t > t_min:
            elem = pq.get()
            if maximizing:
                temp_exp = min((elem.metric_value - opt_acq) / t, 1.0)
            else:
                temp_exp = min((opt_acq - elem.metric_value) / t, 1.0)
            ap = math.exp(temp_exp)
            if ap >= random.uniform(0, 1):
                # Lines 9 and 10 of Algorithm 1.
                for temp_graph in transform(elem.graph):
                    # Skip networks that have already been seen.
                    if contain(descriptors, temp_graph.extract_descriptor()):
                        continue

                    # The acquisition value is the surrogate model's score.
                    temp_acq_value = self.acq(temp_graph)

                    # The queue keeps growing: morphed networks are pushed
                    # too, remembering which father they were grown from.
                    pq.put(
                        elem_class(temp_acq_value, elem.father_id, temp_graph)
                    )
                    descriptors.append(temp_graph.extract_descriptor())

                    # Keep track of the best candidate and its father.
                    if self._accept_new_acq_value(opt_acq, temp_acq_value):
                        opt_acq = temp_acq_value
                        father_id = elem.father_id
                        target_graph = deepcopy(temp_graph)
            t *= alpha

        # No non-duplicate architecture was found.
        if father_id is None:
            return None, None

        nm_graph = self.searcher.load_model_by_id(father_id)
        # Replay target_graph's operation history on the father graph. The
        # history was cleared before queueing, so target_graph only records
        # the operations leading from the father to itself, whereas nm_graph
        # keeps its own full history.
        for args in target_graph.operation_history:
            getattr(nm_graph, args[0])(*list(args[1:]))
        return nm_graph, father_id

    # Equation (10).
    def acq(self, graph):
        """Return the acquisition value of a generated graph.

        Args:
            graph: A graph exposing ``extract_descriptor()``.

        Returns:
            ``mean + beta * std`` when maximizing, ``mean - beta * std``
            otherwise, using the Gaussian process prediction for the graph's
            descriptor.
        """
        mean, std = self.gpr.predict(np.array([graph.extract_descriptor()]))
        if self.optimizemode is OptimizeMode.Maximize:
            return mean + self.beta * std
        return mean - self.beta * std

    def _get_init_opt_acq_value(self):
        """Return the worst possible acquisition value for the current mode."""
        if self.optimizemode is OptimizeMode.Maximize:
            return -np.inf
        return np.inf

    def _accept_new_acq_value(self, opt_acq, temp_acq_value):
        """Return True when ``temp_acq_value`` strictly improves on ``opt_acq``."""
        if temp_acq_value > opt_acq and self.optimizemode is OptimizeMode.Maximize:
            return True
        if temp_acq_value < opt_acq and self.optimizemode is not OptimizeMode.Maximize:
            return True
        return False

    def add_child(self, father_id, model_id):
        """Add a child to the search tree.

        Arguments:
            father_id {int} -- father id
            model_id {int} -- model id
        """
        self.search_tree.add_child(father_id, model_id)
-
-
@total_ordering
class Elem:
    """Priority-queue element ordered by ascending metric value.

    Attributes:
        metric_value: The value elements are compared by.
        father_id: The id of the model this element was derived from.
        graph: The graph carried by this element.
    """

    def __init__(self, metric_value, father_id, graph):
        self.metric_value = metric_value
        self.father_id = father_id
        self.graph = graph

    def __eq__(self, other):
        """Elements are equal when their metric values are equal."""
        return self.metric_value == other.metric_value

    def __lt__(self, other):
        """A smaller metric value sorts first."""
        return self.metric_value < other.metric_value
-
-
class ReverseElem(Elem):
    """Priority-queue element ordered by descending metric value."""

    def __lt__(self, other):
        """A larger metric value sorts first."""
        return self.metric_value > other.metric_value
-
-
def contain(descriptors, target_descriptor):
    """Check whether the target descriptor is already in the descriptors.

    Args:
        descriptors: An iterable of descriptors.
        target_descriptor: The descriptor to look for.

    Returns:
        True when some descriptor is within an edit distance of 1e-5 of the
        target, False otherwise.
    """
    return any(
        edit_distance(descriptor, target_descriptor) < 1e-5
        for descriptor in descriptors
    )
-
-
class SearchTree:
    """The network morphism search tree.

    Attributes:
        root: The id of the root node, or None while the tree is empty.
        adj_list: A dict mapping each node id to the list of its child ids.
    """

    def __init__(self):
        self.root = None
        self.adj_list = {}

    def add_child(self, u, v):
        """Add a child to the search tree.

        Arguments:
            u {int} -- father id; -1 makes ``v`` the root and resets its
                child list
            v {int} -- child id

        Raises:
            KeyError: If ``u`` is neither -1 nor a known node.
        """
        if u == -1:
            self.root = v
            self.adj_list[v] = []
            return

        children = self.adj_list[u]
        if v not in children:
            children.append(v)
        # Make sure the child has its own (initially empty) child list.
        self.adj_list.setdefault(v, [])

    def get_dict(self, u=None):
        """Return the subtree rooted at ``u`` as a nested dict.

        Args:
            u: The node id to start from; None means the root.

        Returns:
            ``{"name": node_id, "children": [...]}`` built recursively, or an
            empty dict when the tree has no root yet.
        """
        if u is None:
            # Without this guard an empty tree would recurse forever.
            if self.root is None:
                return {}
            u = self.root
        children = [self.get_dict(v) for v in self.adj_list[u]]
        return {"name": u, "children": children}
|