You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

bayesian.py 18 kB

2 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517
  1. import math
  2. import random
  3. from copy import deepcopy
  4. from functools import total_ordering
  5. from queue import PriorityQueue
  6. import numpy as np
  7. from scipy.linalg import LinAlgError, cho_solve, cholesky, solve_triangular
  8. from scipy.optimize import linear_sum_assignment
  9. from sklearn.metrics.pairwise import rbf_kernel
  10. from .graph_transformer import transform
  11. from .layers import is_layer
  12. from utils import Constant, OptimizeMode
  13. import logging
  14. logger = logging.getLogger(__name__)
  15. # equation(6) dl
  16. def layer_distance(a, b):
  17. """The distance between two layers."""
  18. # pylint: disable=unidiomatic-typecheck
  19. if not isinstance(a, type(b)):
  20. return 1.0
  21. if is_layer(a, "Conv"):
  22. att_diff = [
  23. (a.filters, b.filters),
  24. (a.kernel_size, b.kernel_size),
  25. (a.stride, b.stride),
  26. ]
  27. return attribute_difference(att_diff)
  28. if is_layer(a, "Pooling"):
  29. att_diff = [
  30. (a.padding, b.padding),
  31. (a.kernel_size, b.kernel_size),
  32. (a.stride, b.stride),
  33. ]
  34. return attribute_difference(att_diff)
  35. return 0.0
  36. # equation(6)
  37. def attribute_difference(att_diff):
  38. ''' The attribute distance.
  39. '''
  40. ret = 0
  41. for a_value, b_value in att_diff:
  42. if max(a_value, b_value) == 0:
  43. ret += 0
  44. else:
  45. ret += abs(a_value - b_value) * 1.0 / max(a_value, b_value)
  46. return ret * 1.0 / len(att_diff)
  47. # equation(7) A
  48. def layers_distance(list_a, list_b):
  49. """The distance between the layers of two neural networks."""
  50. len_a = len(list_a)
  51. len_b = len(list_b)
  52. f = np.zeros((len_a + 1, len_b + 1))
  53. f[-1][-1] = 0
  54. for i in range(-1, len_a):
  55. f[i][-1] = i + 1
  56. for j in range(-1, len_b):
  57. f[-1][j] = j + 1
  58. for i in range(len_a):
  59. for j in range(len_b):
  60. f[i][j] = min(
  61. f[i][j - 1] + 1,
  62. f[i - 1][j] + 1,
  63. f[i - 1][j - 1] + layer_distance(list_a[i], list_b[j]),
  64. )
  65. return f[len_a - 1][len_b - 1]
  66. # equation (9) ds
  67. # 0: topo rank of the start, 1: rank of the end
  68. def skip_connection_distance(a, b):
  69. """The distance between two skip-connections."""
  70. if a[2] != b[2]:
  71. return 1.0
  72. len_a = abs(a[1] - a[0])
  73. len_b = abs(b[1] - b[0])
  74. return (abs(a[0] - b[0]) + abs(len_a - len_b)) / \
  75. (max(a[0], b[0]) + max(len_a, len_b))
  76. # equation (8) Ds
  77. # Convert the minimization part of equation (8) into a bipartite graph matching problem and solve it with the Hungarian algorithm (linear_sum_assignment).
  78. def skip_connections_distance(list_a, list_b):
  79. """The distance between the skip-connections of two neural networks."""
  80. distance_matrix = np.zeros((len(list_a), len(list_b)))
  81. for i, a in enumerate(list_a):
  82. for j, b in enumerate(list_b):
  83. distance_matrix[i][j] = skip_connection_distance(a, b)
  84. return distance_matrix[linear_sum_assignment(distance_matrix)].sum() + abs(
  85. len(list_a) - len(list_b)
  86. )
  87. # equation (4)
  88. def edit_distance(x, y):
  89. """The distance between two neural networks.
  90. Args:
  91. x: An instance of NetworkDescriptor.
  92. y: An instance of NetworkDescriptor
  93. Returns:
  94. The edit-distance between x and y.
  95. """
  96. ret = layers_distance(x.layers, y.layers)
  97. ret += Constant.KERNEL_LAMBDA * skip_connections_distance(
  98. x.skip_connections, y.skip_connections
  99. )
  100. return ret
  101. class IncrementalGaussianProcess:
  102. """Gaussian process regressor.
  103. Attributes:
  104. alpha: A hyperparameter.
  105. """
  106. def __init__(self):
  107. self.alpha = 1e-10
  108. self._distance_matrix = None
  109. self._x = None
  110. self._y = None
  111. self._first_fitted = False
  112. self._l_matrix = None
  113. self._alpha_vector = None
  114. @property
  115. def kernel_matrix(self):
  116. ''' Kernel matric.
  117. '''
  118. return self._distance_matrix
  119. def fit(self, train_x, train_y):
  120. """ Fit the regressor with more data.
  121. Args:
  122. train_x: A list of NetworkDescriptor.
  123. train_y: A list of metric values.
  124. """
  125. if self.first_fitted:
  126. self.incremental_fit(train_x, train_y)
  127. else:
  128. self.first_fit(train_x, train_y)
  129. # compute the kernel matrix k, alpha_vector
  130. # 和first fit区别就是需要加入新的训练样本扩充distance matrix
  131. def incremental_fit(self, train_x, train_y):
  132. """ Incrementally fit the regressor. """
  133. if not self._first_fitted:
  134. raise ValueError(
  135. "The first_fit function needs to be called first.")
  136. train_x, train_y = np.array(train_x), np.array(train_y)
  137. # Incrementally compute K
  138. up_right_k = edit_distance_matrix(self._x, train_x)
  139. down_left_k = np.transpose(up_right_k)
  140. down_right_k = edit_distance_matrix(train_x)
  141. up_k = np.concatenate((self._distance_matrix, up_right_k), axis=1)
  142. down_k = np.concatenate((down_left_k, down_right_k), axis=1)
  143. temp_distance_matrix = np.concatenate((up_k, down_k), axis=0)
  144. k_matrix = bourgain_embedding_matrix(temp_distance_matrix)
  145. diagonal = np.diag_indices_from(k_matrix)
  146. diagonal = (diagonal[0][-len(train_x):], diagonal[1][-len(train_x):])
  147. k_matrix[diagonal] += self.alpha
  148. try:
  149. self._l_matrix = cholesky(k_matrix, lower=True) # Line 2
  150. except LinAlgError as err:
  151. logger.error('LinAlgError')
  152. return self
  153. self._x = np.concatenate((self._x, train_x), axis=0)
  154. self._y = np.concatenate((self._y, train_y), axis=0)
  155. self._distance_matrix = temp_distance_matrix
  156. self._alpha_vector = cho_solve(
  157. (self._l_matrix, True), self._y) # Line 3
  158. return self
  159. @property
  160. def first_fitted(self):
  161. ''' if it is firsr fitted
  162. '''
  163. return self._first_fitted
  164. # update过程,第一次fit。
  165. def first_fit(self, train_x, train_y):
  166. """ Fit the regressor for the first time. """
  167. train_x, train_y = np.array(train_x), np.array(train_y)
  168. self._x = np.copy(train_x)
  169. self._y = np.copy(train_y)
  170. self._distance_matrix = edit_distance_matrix(self._x)
  171. k_matrix = bourgain_embedding_matrix(self._distance_matrix)
  172. k_matrix[np.diag_indices_from(k_matrix)] += self.alpha
  173. self._l_matrix = cholesky(k_matrix, lower=True) # Line 2
  174. # cho_solve Ax = b return x = A^{-1}b
  175. self._alpha_vector = cho_solve(
  176. (self._l_matrix, True), self._y) # Line 3
  177. self._first_fitted = True
  178. return self
  179. # 获得 predictive distribution 的 mean & std
  180. def predict(self, train_x):
  181. """Predict the result.
  182. Args:
  183. train_x: A list of NetworkDescriptor.
  184. Returns:
  185. y_mean: The predicted mean.
  186. y_std: The predicted standard deviation.
  187. """
  188. k_trans = np.exp(-np.power(edit_distance_matrix(train_x, self._x), 2))
  189. y_mean = k_trans.dot(self._alpha_vector) # Line 4 (y_mean = f_star)
  190. # compute inverse K_inv of K based on its Cholesky
  191. # decomposition L and its inverse L_inv
  192. l_inv = solve_triangular(
  193. self._l_matrix.T, np.eye(
  194. self._l_matrix.shape[0]))
  195. k_inv = l_inv.dot(l_inv.T)
  196. # Compute variance of predictive distribution
  197. y_var = np.ones(len(train_x), dtype=np.float)
  198. y_var -= np.einsum("ij,ij->i", np.dot(k_trans, k_inv), k_trans)
  199. # Check if any of the variances is negative because of
  200. # numerical issues. If yes: set the variance to 0.
  201. y_var_negative = y_var < 0
  202. if np.any(y_var_negative):
  203. y_var[y_var_negative] = 0.0
  204. return y_mean, np.sqrt(y_var)
  205. def edit_distance_matrix(train_x, train_y=None):
  206. """Calculate the edit distance.
  207. Args:
  208. train_x: A list of neural architectures.
  209. train_y: A list of neural architectures.
  210. Returns:
  211. An edit-distance matrix.
  212. """
  213. if train_y is None:
  214. ret = np.zeros((train_x.shape[0], train_x.shape[0]))
  215. for x_index, x in enumerate(train_x):
  216. for y_index, y in enumerate(train_x):
  217. if x_index == y_index:
  218. ret[x_index][y_index] = 0
  219. elif x_index < y_index:
  220. ret[x_index][y_index] = edit_distance(x, y)
  221. else:
  222. ret[x_index][y_index] = ret[y_index][x_index]
  223. return ret
  224. ret = np.zeros((train_x.shape[0], train_y.shape[0]))
  225. for x_index, x in enumerate(train_x):
  226. for y_index, y in enumerate(train_y):
  227. ret[x_index][y_index] = edit_distance(x, y)
  228. return ret
  229. def vector_distance(a, b):
  230. """The Euclidean distance between two vectors."""
  231. a = np.array(a)
  232. b = np.array(b)
  233. return np.linalg.norm(a - b)
  234. # Mapping from the edit-distance matrix space to Euclidean space.
  235. def bourgain_embedding_matrix(distance_matrix):
  236. """Use Bourgain algorithm to embed the neural architectures based on their edit-distance.
  237. Args:
  238. distance_matrix: A matrix of edit-distances.
  239. Returns:
  240. A matrix of distances after embedding.
  241. """
  242. distance_matrix = np.array(distance_matrix)
  243. n = len(distance_matrix)
  244. if n == 1:
  245. return distance_matrix
  246. np.random.seed(123)
  247. distort_elements = []
  248. r = range(n)
  249. k = int(math.ceil(math.log(n) / math.log(2) - 1))
  250. t = int(math.ceil(math.log(n)))
  251. counter = 0
  252. for i in range(0, k + 1):
  253. for t in range(t):
  254. s = np.random.choice(r, 2 ** i)
  255. for j in r:
  256. d = min([distance_matrix[j][s] for s in s])
  257. counter += len(s)
  258. if i == 0 and t == 0:
  259. distort_elements.append([d])
  260. else:
  261. distort_elements[j].append(d)
  262. return rbf_kernel(distort_elements, distort_elements)
  263. class BayesianOptimizer:
  264. """ A Bayesian optimizer for neural architectures.
  265. Attributes:
  266. searcher: The Searcher which is calling the Bayesian optimizer.
  267. t_min: The minimum temperature for simulated annealing.
  268. metric: An instance of the Metric subclasses.
  269. gpr: A GaussianProcessRegressor for bayesian optimization.
  270. beta: The beta in acquisition function. (refer to our paper)
  271. search_tree: The network morphism search tree.
  272. """
  273. def __init__(self, searcher, t_min, optimizemode, beta=None):
  274. self.searcher = searcher
  275. self.t_min = t_min
  276. self.optimizemode = optimizemode
  277. self.gpr = IncrementalGaussianProcess()
  278. self.beta = beta if beta is not None else Constant.BETA
  279. self.search_tree = SearchTree()
  280. def fit(self, x_queue, y_queue):
  281. """ Fit the optimizer with new architectures and performances.
  282. Args:
  283. x_queue: A list of NetworkDescriptor.
  284. y_queue: A list of metric values.
  285. """
  286. self.gpr.fit(x_queue, y_queue)
  287. # Algorithm 1
  288. # optimize acquisition function
  289. def generate(self, descriptors):
  290. """Generate new architecture.
  291. Args:
  292. descriptors: All the searched neural architectures. (search history)
  293. Returns:
  294. graph: An instance of Graph. A morphed neural network with weights.
  295. father_id: The father node ID in the search tree.
  296. """
  297. model_ids = self.search_tree.adj_list.keys()
  298. target_graph = None
  299. father_id = None
  300. descriptors = deepcopy(descriptors)
  301. elem_class = Elem
  302. if self.optimizemode is OptimizeMode.Maximize:
  303. elem_class = ReverseElem
  304. '''
  305. 1.初始化优先队列
  306. 2.优先队列里面元素为之前所有生成的模型
  307. '''
  308. pq = PriorityQueue()
  309. temp_list = []
  310. for model_id in model_ids:
  311. metric_value = self.searcher.get_metric_value_by_id(model_id)
  312. temp_list.append((metric_value, model_id))
  313. temp_list = sorted(temp_list)
  314. for metric_value, model_id in temp_list:
  315. graph = self.searcher.load_model_by_id(model_id)
  316. graph.clear_operation_history()
  317. graph.clear_weights()
  318. # 已经产生的模型father_id就是自己的id
  319. pq.put(elem_class(metric_value, model_id, graph))
  320. t = 1.0
  321. t_min = self.t_min
  322. alpha = 0.9
  323. opt_acq = self._get_init_opt_acq_value()
  324. num_iter = 0
  325. # logger.info('initial queue size ', pq.qsize())
  326. while not pq.empty() and t > t_min:
  327. num_iter += 1
  328. elem = pq.get()
  329. # logger.info("elem.metric_value:{}".format(elem.metric_value))
  330. # logger.info("opt_acq:{}".format(opt_acq))
  331. if self.optimizemode is OptimizeMode.Maximize:
  332. temp_exp = min((elem.metric_value - opt_acq) / t, 1.0)
  333. else:
  334. temp_exp = min((opt_acq - elem.metric_value) / t, 1.0)
  335. # logger.info("temp_exp this round ", temp_exp)
  336. ap = math.exp(temp_exp)
  337. # logger.info("ap this round ", ap)
  338. if ap >= random.uniform(0, 1):
  339. # line 9,10 in algorithm 1
  340. for temp_graph in transform(elem.graph):
  341. # 已经出现过的网络不加入
  342. if contain(descriptors, temp_graph.extract_descriptor()):
  343. continue
  344. #用acq作为贝叶斯模型给出的评价标准
  345. temp_acq_value = self.acq(temp_graph)
  346. # 这个优先队列会不断增长,就算transform出来的网络也会进入。
  347. pq.put(
  348. # 记住这个模型是从哪个father生长出来的
  349. elem_class(
  350. temp_acq_value,
  351. elem.father_id,
  352. temp_graph))
  353. # logger.info('temp_acq_value ', temp_acq_value)
  354. # logger.info('queue size ', pq.qsize())
  355. descriptors.append(temp_graph.extract_descriptor())
  356. # 选一个最好的当父
  357. if self._accept_new_acq_value(opt_acq, temp_acq_value):
  358. opt_acq = temp_acq_value
  359. father_id = elem.father_id
  360. target_graph = deepcopy(temp_graph)
  361. t *= alpha
  362. # logger.info('number of iter in this search {}'.format(num_iter))
  363. # Did not found a not duplicated architecture
  364. if father_id is None:
  365. return None, None
  366. nm_graph = self.searcher.load_model_by_id(father_id)
  367. # 从当前父graph开始,根据target_graph中的operation_history,一步步从当前父网络操作到target_graph
  368. # 因为在存入pq时进行了clear_operation_history()操作。等于target_graph中只存了从当前父网络到target_graph的操作
  369. # 而nm_graph中的operation_history保存完整的,到基类的history
  370. for args in target_graph.operation_history:
  371. getattr(nm_graph, args[0])(*list(args[1:]))
  372. # target space
  373. return nm_graph, father_id
  374. # equation (10)
  375. def acq(self, graph):
  376. ''' estimate the value of generated graph
  377. '''
  378. mean, std = self.gpr.predict(np.array([graph.extract_descriptor()]))
  379. if self.optimizemode is OptimizeMode.Maximize:
  380. return mean + self.beta * std
  381. return mean - self.beta * std
  382. def _get_init_opt_acq_value(self):
  383. if self.optimizemode is OptimizeMode.Maximize:
  384. return -np.inf
  385. return np.inf
  386. def _accept_new_acq_value(self, opt_acq, temp_acq_value):
  387. if temp_acq_value > opt_acq and self.optimizemode is OptimizeMode.Maximize:
  388. return True
  389. if temp_acq_value < opt_acq and not self.optimizemode is OptimizeMode.Maximize:
  390. return True
  391. return False
  392. def add_child(self, father_id, model_id):
  393. ''' add child to the search tree
  394. Arguments:
  395. father_id {int} -- father id
  396. model_id {int} -- model id
  397. '''
  398. self.search_tree.add_child(father_id, model_id)
  399. @total_ordering
  400. class Elem:
  401. """Elements to be sorted according to metric value."""
  402. def __init__(self, metric_value, father_id, graph):
  403. self.father_id = father_id
  404. self.graph = graph
  405. self.metric_value = metric_value
  406. def __eq__(self, other):
  407. return self.metric_value == other.metric_value
  408. def __lt__(self, other):
  409. return self.metric_value < other.metric_value
  410. class ReverseElem(Elem):
  411. """Elements to be reversely sorted according to metric value."""
  412. def __lt__(self, other):
  413. return self.metric_value > other.metric_value
  414. def contain(descriptors, target_descriptor):
  415. """Check if the target descriptor is in the descriptors."""
  416. for descriptor in descriptors:
  417. if edit_distance(descriptor, target_descriptor) < 1e-5:
  418. return True
  419. return False
  420. class SearchTree:
  421. """The network morphism search tree."""
  422. def __init__(self):
  423. self.root = None
  424. self.adj_list = {}
  425. def add_child(self, u, v):
  426. ''' add child to search tree itself.
  427. Arguments:
  428. u {int} -- father id
  429. v {int} -- child id
  430. '''
  431. if u == -1:
  432. self.root = v
  433. self.adj_list[v] = []
  434. return
  435. if v not in self.adj_list[u]:
  436. self.adj_list[u].append(v)
  437. if v not in self.adj_list:
  438. self.adj_list[v] = []
  439. def get_dict(self, u=None):
  440. """ A recursive function to return the content of the tree in a dict."""
  441. if u is None:
  442. return self.get_dict(self.root)
  443. children = []
  444. for v in self.adj_list[u]:
  445. children.append(self.get_dict(v))
  446. ret = {"name": u, "children": children}
  447. return ret

一站式算法开发平台、高性能分布式深度学习框架、先进算法模型库、视觉模型炼知平台、数据可视化分析平台等一系列平台及工具,在模型高效分布式训练、数据处理和可视分析、模型炼知和轻量化等技术上形成独特优势,目前已在产学研等各领域近千家单位及个人提供AI应用赋能