
common_feature.py

  1. """
  2. Copyright 2020 Tianshu AI Platform. All Rights Reserved.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. =============================================================
  13. """
  14. from kamal.core.engine.engine import Engine
  15. from kamal.core.engine.hooks import FeatureHook
  16. from kamal.core import tasks
  17. from kamal.utils import set_mode, move_to_device
  18. import torch
  19. import torch.nn as nn
  20. import torch.nn.functional as F
  21. import typing, time
  22. import numpy as np
  23. def conv3x3(in_planes, out_planes, stride=1):
  24. """3x3 convolution with padding"""
  25. return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
  26. padding=1, bias=False)
  27. class ResBlock(nn.Module):
  28. """ Residual Blocks
  29. """
  30. def __init__(self, inplanes, planes, stride=1, momentum=0.1):
  31. super(ResBlock, self).__init__()
  32. self.conv1 = conv3x3(inplanes, planes, stride)
  33. self.bn1 = nn.BatchNorm2d(planes, momentum=momentum)
  34. self.relu = nn.ReLU(inplace=True)
  35. self.conv2 = conv3x3(planes, planes)
  36. self.bn2 = nn.BatchNorm2d(planes, momentum=momentum)
  37. if stride > 1 or inplanes != planes:
  38. self.downsample = nn.Sequential(
  39. nn.Conv2d(inplanes, planes, kernel_size=1,
  40. stride=stride, bias=False),
  41. nn.BatchNorm2d(planes, momentum=momentum)
  42. )
  43. else:
  44. self.downsample = None
  45. self.stride = stride
  46. def forward(self, x):
  47. residual = x
  48. out = self.conv1(x)
  49. out = self.bn1(out)
  50. out = self.relu(out)
  51. out = self.conv2(out)
  52. out = self.bn2(out)
  53. if self.downsample is not None:
  54. residual = self.downsample(x)
  55. out += residual
  56. out = self.relu(out)
  57. return out
class CFL_FCBlock(nn.Module):
    """Common Feature Block for fully-connected layers.

    This module captures the common features of multiple teachers; an MMD loss
    is then computed between these common features and the student features.
    See the shape sketch after this class definition for an example.

    **Parameters:**
        - cs (int): channel number of the student features
        - cts (list or tuple): channel numbers of the teacher features
        - ch (int): channel number of the hidden (common) features
    """

    def __init__(self, cs, cts, ch, k_size=5):
        super(CFL_FCBlock, self).__init__()
        self.align_t = nn.ModuleList()
        for ct in cts:
            self.align_t.append(
                nn.Sequential(
                    nn.Linear(ct, ch),
                    nn.ReLU(inplace=True)
                )
            )
        self.align_s = nn.Sequential(
            nn.Linear(cs, ch),
            nn.ReLU(inplace=True),
        )
        self.extractor = nn.Sequential(
            nn.Linear(ch, ch),
            nn.ReLU(),
            nn.Linear(ch, ch),
        )
        self.dec_t = nn.ModuleList()
        for ct in cts:
            self.dec_t.append(
                nn.Sequential(
                    nn.Linear(ch, ct),
                    nn.ReLU(inplace=True),
                    nn.Linear(ct, ct)
                )
            )

    def init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                torch.nn.init.kaiming_normal_(m.weight, nonlinearity='relu')
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def forward(self, fs, fts):
        # Project student and teacher features into the shared hidden space,
        # extract common features, then decode back to each teacher's space.
        aligned_t = [self.align_t[i](fts[i]) for i in range(len(fts))]
        aligned_s = self.align_s(fs)
        hts = [self.extractor(f) for f in aligned_t]
        hs = self.extractor(aligned_s)
        _fts = [self.dec_t[i](hts[i]) for i in range(len(hts))]
        return (hs, hts), (_fts, fts)
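
# Illustrative sketch (not part of the original module): a minimal shape check
# for CFL_FCBlock with random tensors. The batch size and the student/teacher/
# hidden dimensions below are arbitrary assumptions chosen only for this example.
def _example_cfl_fcblock_shapes():
    fc_block = CFL_FCBlock(cs=256, cts=[512, 1024], ch=64)
    fs = torch.randn(4, 256)                            # student features
    fts = [torch.randn(4, 512), torch.randn(4, 1024)]   # one entry per teacher
    (hs, hts), (_fts, fts_out) = fc_block(fs, fts)
    # Student and teachers now live in the same 64-d common space ...
    assert hs.shape == (4, 64) and all(ht.shape == (4, 64) for ht in hts)
    # ... while the decoders map the common features back to each teacher's size.
    assert [f.shape for f in _fts] == [(4, 512), (4, 1024)]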
class CFL_ConvBlock(nn.Module):
    """Common Feature Block for convolutional layers.

    This module captures the common features of multiple teachers; an MMD loss
    is then computed between these common features and the student features.
    See the shape sketch after this class definition for an example.

    **Parameters:**
        - cs (int): channel number of the student features
        - cts (list or tuple): channel numbers of the teacher features
        - ch (int): channel number of the hidden (common) features
    """

    def __init__(self, cs, cts, ch, k_size=5):
        super(CFL_ConvBlock, self).__init__()
        self.align_t = nn.ModuleList()
        for ct in cts:
            self.align_t.append(
                nn.Sequential(
                    nn.Conv2d(in_channels=ct, out_channels=ch,
                              kernel_size=1),
                    nn.BatchNorm2d(ch),
                    nn.ReLU(inplace=True)
                )
            )
        self.align_s = nn.Sequential(
            nn.Conv2d(in_channels=cs, out_channels=ch,
                      kernel_size=1),
            nn.BatchNorm2d(ch),
            nn.ReLU(inplace=True),
        )
        self.extractor = nn.Sequential(
            ResBlock(inplanes=ch, planes=ch, stride=1),
            ResBlock(inplanes=ch, planes=ch, stride=1),
        )
        self.dec_t = nn.ModuleList()
        for ct in cts:
            self.dec_t.append(
                nn.Sequential(
                    nn.Conv2d(ch, ch, kernel_size=1, stride=1),
                    nn.BatchNorm2d(ch),
                    nn.ReLU(inplace=True),
                    nn.Conv2d(ch, ct, kernel_size=1, stride=1)
                )
            )

    def init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                torch.nn.init.kaiming_normal_(m.weight, nonlinearity='relu')
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def forward(self, fs, fts):
        # Project student and teacher feature maps into the shared hidden space,
        # extract common features, then decode back to each teacher's channel count.
        aligned_t = [self.align_t[i](fts[i]) for i in range(len(fts))]
        aligned_s = self.align_s(fs)
        hts = [self.extractor(f) for f in aligned_t]
        hs = self.extractor(aligned_s)
        _fts = [self.dec_t[i](hts[i]) for i in range(len(hts))]
        return (hs, hts), (_fts, fts)
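
# Illustrative sketch (not part of the original module): a minimal shape check
# for CFL_ConvBlock with random feature maps. The channel counts and spatial
# size below are arbitrary assumptions chosen only for this example.
def _example_cfl_convblock_shapes():
    conv_block = CFL_ConvBlock(cs=128, cts=[256, 512], ch=32)
    fs = torch.randn(4, 128, 8, 8)                                # student feature map
    fts = [torch.randn(4, 256, 8, 8), torch.randn(4, 512, 8, 8)]  # one map per teacher
    (hs, hts), (_fts, fts_out) = conv_block(fs, fts)
    # All features are aligned to a 32-channel common space at the same resolution ...
    assert hs.shape == (4, 32, 8, 8) and all(ht.shape == (4, 32, 8, 8) for ht in hts)
    # ... and decoded back to each teacher's channel count for the reconstruction loss.
    assert [f.shape for f in _fts] == [(4, 256, 8, 8), (4, 512, 8, 8)]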
class CommonFeatureAmalgamator(Engine):

    def setup(
        self,
        student,
        teachers,
        layer_groups: typing.Sequence[typing.Sequence],
        layer_channels: typing.Sequence[typing.Sequence],
        dataloader: torch.utils.data.DataLoader,
        optimizer: torch.optim.Optimizer,
        weights=[1.0, 1.0, 1.0],
        on_layer_input=False,
        device=None,
    ):
        self._dataloader = dataloader
        if device is None:
            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self._device = device
        self._model = self._student = student.to(self._device)
        self._teachers = nn.ModuleList(teachers).to(self._device)
        self._optimizer = optimizer
        self._weights = weights
        self._on_layer_input = on_layer_input

        # Build one CFL block per hooked layer group; C is ordered [student, teacher_1, ...].
        amal_blocks = []
        for group, C in zip(layer_groups, layer_channels):
            hooks = [FeatureHook(layer) for layer in group]
            if isinstance(group[0], nn.Linear):
                amal_block = CFL_FCBlock(cs=C[0], cts=C[1:], ch=C[0] // 4).to(self._device).train()
                print("Building FC Blocks")
            else:
                amal_block = CFL_ConvBlock(cs=C[0], cts=C[1:], ch=C[0] // 4).to(self._device).train()
                print("Building Conv Blocks")
            amal_blocks.append((amal_block, hooks, C))
        self._amal_blocks = amal_blocks
        self._cfl_criterion = tasks.loss.CFLLoss(sigmas=[0.001, 0.01, 0.05, 0.1, 0.2, 1, 2])

    @property
    def device(self):
        return self._device

    def run(self, max_iter, start_iter=0, epoch_length=None):
        # The CFL blocks get their own optimizer/scheduler, mirroring the type
        # and learning rate of the student optimizer.
        block_params = []
        for block, _, _ in self._amal_blocks:
            block_params.extend(list(block.parameters()))
        if isinstance(self._optimizer, torch.optim.SGD):
            self._amal_optimizer = torch.optim.SGD(block_params, lr=self._optimizer.param_groups[0]['lr'], momentum=0.9, weight_decay=1e-4)
        else:
            self._amal_optimizer = torch.optim.Adam(block_params, lr=self._optimizer.param_groups[0]['lr'], weight_decay=1e-4)
        self._amal_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(self._amal_optimizer, T_max=max_iter)

        with set_mode(self._student, training=True), \
             set_mode(self._teachers, training=False):
            super(CommonFeatureAmalgamator, self).run(
                self.step_fn, self._dataloader, start_iter=start_iter, max_iter=max_iter, epoch_length=epoch_length)

    def step_fn(self, engine, batch):
        start_time = time.perf_counter()
        batch = move_to_device(batch, self._device)
        data = batch[0]
        s_out = self._student(data)
        with torch.no_grad():
            t_out = [teacher(data) for teacher in self._teachers]

        # Common-feature (MMD) loss and reconstruction loss over all hooked layer groups.
        loss_amal = 0
        loss_recons = 0
        for amal_block, hooks, C in self._amal_blocks:
            features = [h.feat_in if self._on_layer_input else h.feat_out for h in hooks]
            fs, fts = features[0], features[1:]
            (hs, hts), (_fts, fts) = amal_block(fs, fts)
            _loss_amal, _loss_recons = self._cfl_criterion(hs, hts, _fts, fts)
            loss_amal += _loss_amal
            loss_recons += _loss_recons

        # Knowledge-distillation loss against the concatenated teacher outputs.
        loss_kd = tasks.loss.kldiv(s_out, torch.cat(t_out, dim=1))
        loss_dict = {
            'loss_kd': self._weights[0] * loss_kd,
            'loss_amal': self._weights[1] * loss_amal,
            'loss_recons': self._weights[2] * loss_recons
        }
        loss = sum(loss_dict.values())

        self._optimizer.zero_grad()
        self._amal_optimizer.zero_grad()
        loss.backward()
        self._optimizer.step()
        self._amal_optimizer.step()
        self._amal_scheduler.step()

        step_time = time.perf_counter() - start_time
        metrics = {loss_name: loss_value.item() for (loss_name, loss_value) in loss_dict.items()}
        metrics.update({
            'total_loss': loss.item(),
            'step_time': step_time,
            'lr': float(self._optimizer.param_groups[0]['lr'])
        })
        return metrics
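
For context, the sketch below shows one way the CommonFeatureAmalgamator defined above might be wired up. Only the setup()/run() call pattern comes from the file itself; the models, hooked layers, channel counts, dataset, learning rate, and the zero-argument construction of the engine are placeholder assumptions made for illustration.

# Hypothetical usage sketch; all model and data choices here are assumptions.
import torch
from torch.utils.data import DataLoader, TensorDataset
from torchvision.models import resnet18

# Two teachers trained on disjoint 100-class tasks; the student predicts the
# concatenated 200-class output, matching torch.cat(t_out, dim=1) in step_fn.
teacher1 = resnet18(num_classes=100)
teacher2 = resnet18(num_classes=100)
student = resnet18(num_classes=200)

# Placeholder transfer data; in practice this would be a real unlabeled dataset.
images = torch.randn(32, 3, 224, 224)
dataloader = DataLoader(TensorDataset(images), batch_size=8, shuffle=True)

optimizer = torch.optim.SGD(student.parameters(), lr=0.01, momentum=0.9)

amalgamator = CommonFeatureAmalgamator()
amalgamator.setup(
    student=student,
    teachers=[teacher1, teacher2],
    # Hook the last conv stage of each network; each channel list is ordered
    # [student, teacher1, teacher2] to match cs=C[0], cts=C[1:] in setup().
    layer_groups=[[student.layer4, teacher1.layer4, teacher2.layer4]],
    layer_channels=[[512, 512, 512]],
    dataloader=dataloader,
    optimizer=optimizer,
    weights=[1.0, 1.0, 1.0],
)
amalgamator.run(max_iter=1000)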

A suite of platforms and tools including a one-stop algorithm development platform, a high-performance distributed deep learning framework, an advanced algorithm model library, a visual model knowledge-amalgamation platform, and a data visualization and analysis platform. It offers distinctive strengths in efficient distributed model training, data processing and visual analysis, and model knowledge amalgamation and lightweighting, and currently provides AI application enablement to nearly a thousand organizations and individuals across industry, academia, and research.
