GitOrigin-RevId: befb85fd43
tags/v1.7.2.m1
@@ -0,0 +1,100 @@
/**
 * \file src/opr/impl/training/dataview.cpp
 *
 * This file is part of MegBrain, a deep learning framework developed by Megvii.
 *
 * \copyright Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 */
#include "megbrain/opr/training/dataview.h" | |||||
#include "megbrain/exception.h" | |||||
#include "megbrain/opr/basic_arith_wrapper.h" | |||||
#include "megbrain/opr/io.h" | |||||
#include "megbrain/opr/tensor_manip.h" | |||||
#include "megbrain/tensor.h" | |||||
#include <random> | |||||
namespace mgb { | |||||
DataLoader::DataLoader(
        std::shared_ptr<IDataView> dataview, mgb::CompNode comp_node,
        unsigned long batchsize, bool shuffle, bool drop_last)
        : m_dataview(dataview),
          m_comp_node(comp_node),
          m_batchsize(batchsize),
          m_shuffle(shuffle),
          m_drop_last(drop_last),
          m_idx(0) {
    if (!m_comp_node.valid()) {
        m_comp_node = CompNode::load("xpu0");
    }
    for (size_t i = 0; i < m_dataview->size(); i++) {
        m_index_collection.push_back(i);
    }
    if (m_dataview->size() > 0) {
        // Infer batch shapes from the first sample: data keeps every sample
        // dim after the new batch dim, while labels drop their leading dim.
        auto data_sample = m_dataview->get_item(0);
        SmallVector<size_t> dshape;
        dshape.push_back(static_cast<size_t>(batchsize));
        for (size_t i = 0; i < data_sample.first->layout().ndim; i++) {
            dshape.push_back(data_sample.first->shape()[i]);
        }
        m_data_shape = dshape;
        SmallVector<size_t> lshape;
        lshape.push_back(m_batchsize);
        for (size_t i = 1; i < data_sample.second->layout().ndim; i++) {
            lshape.push_back(data_sample.second->shape()[i]);
        }
        m_label_shape = lshape;
        m_data_type = data_sample.first->dtype();
        m_label_type = data_sample.second->dtype();
    } else {
        mgb_throw(AssertionError, "The dataset is empty.");
    }
}

size_t DataLoader::size() {
    // Number of full batches. m_drop_last is not consulted yet: a trailing
    // partial batch is always dropped.
    return m_dataview->size() / m_batchsize;
}

DataPair DataLoader::next() {
    if (m_idx == 0 && m_shuffle) {
        // Keep one engine alive across epochs; constructing a default-seeded
        // engine here would replay the same permutation every epoch.
        static std::mt19937 rng{std::random_device{}()};
        std::shuffle(m_index_collection.begin(), m_index_collection.end(), rng);
    }
    // Wrap around once fewer than m_batchsize indices remain; phrased as an
    // addition so it cannot underflow when the dataset holds fewer samples
    // than one batch.
    if (m_idx + m_batchsize > m_index_collection.size()) {
        m_idx = 0;
    }
    auto data = std::make_shared<HostTensorND>(m_comp_node, m_data_shape, m_data_type);
    auto label =
            std::make_shared<HostTensorND>(m_comp_node, m_label_shape, m_label_type);
    size_t data_bytes = m_dataview->get_item(m_index_collection.at(m_idx))
                                .first->layout()
                                .access_bytes();
    size_t label_bytes = m_dataview->get_item(m_index_collection.at(m_idx))
                                 .second->layout()
                                 .access_bytes();
    auto data_ptr = data->raw_ptr();
    auto label_ptr = label->raw_ptr();
    for (unsigned int i = 0; i < m_batchsize; i++) {
        auto item = m_dataview->get_item(m_index_collection.at(m_idx));
        // Samples are packed back to back; dt_byte is one byte wide.
        memcpy(data_ptr + data_bytes * i, item.first->raw_ptr(), data_bytes);
        memcpy(label_ptr + label_bytes * i, item.second->raw_ptr(), label_bytes);
        m_idx++;
    }
    return {data, label};
}
} // namespace mgb
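
For illustration, a minimal in-memory `IDataView` that `DataLoader` can consume; the `VectorDataView` name and layout are hypothetical, not part of this patch:

```cpp
// Hypothetical adapter: serves pre-built (data, label) host-tensor pairs.
class VectorDataView final : public IDataView {
public:
    explicit VectorDataView(std::vector<DataPair> samples)
            : m_samples(std::move(samples)) {}
    DataPair get_item(size_t idx) override { return m_samples.at(idx); }
    size_t size() override { return m_samples.size(); }

private:
    std::vector<DataPair> m_samples;
};

// Typical epoch loop:
//     DataLoader loader(dataview, CompNode::load("cpu0"), 32, /*shuffle=*/true);
//     for (size_t i = 0; i < loader.size(); ++i)
//         auto batch = loader.next();  // batched {data, label} host tensors
```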
@@ -0,0 +1,82 @@
/**
 * \file src/opr/impl/training/loss.cpp
 *
 * This file is part of MegBrain, a deep learning framework developed by Megvii.
 *
 * \copyright Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 */
#include "megbrain/opr/training/loss.h"
#include "megbrain/exception.h"
#include "megbrain/opr/indexing.h"

namespace mgb {
namespace loss {
CrossEntropyLoss::CrossEntropyLoss(
        bool with_logits, float label_smooth, ReduceMode reduce_mode, int axis)
        : m_with_logits(with_logits),
          m_label_smooth(label_smooth),
          m_reduce_mode(reduce_mode),
          m_axis(axis) {}

SymbolVar CrossEntropyLoss::operator()(
        mgb::SymbolVar symbol_pred, mgb::SymbolVar symbol_label) {
    mgb_assert(
            symbol_pred.shape().ndim >= symbol_label.shape().ndim,
            "The label must not have more dimensions than the pred.");
    for (size_t i = 0; i < symbol_label.shape().ndim; i++) {
        mgb_assert(
                symbol_pred.shape()[i] == symbol_label.shape()[i] || (int)i == m_axis,
                "Unmatched shape for pred and label.");
    }
    mgb_assert(m_label_smooth >= .0f, "The label_smooth must be non-negative.");
    SymbolVar symbol_loss;
    SymbolVar symbol_middle;
    SymbolVar symbol_max = opr::reduce_ax_max(symbol_pred, m_axis);
    SymbolVar symbol_primary_item =
            opr::IndexingOneHot::make(symbol_pred, symbol_label, {m_axis});
    if (m_with_logits) {
        symbol_middle = opr::reduce_ax_sum(symbol_pred, m_axis) /
                        opr::GetVarShape::make(symbol_pred, {m_axis});
        // Numerically stable log-sum-exp: shift by the per-axis max.
        symbol_loss = symbol_max +
                      opr::log(opr::reduce_ax_sum(
                              opr::exp(symbol_pred - symbol_max), m_axis));
    } else {
        symbol_middle = opr::reduce_ax_sum(opr::log(symbol_pred), m_axis) /
                        opr::GetVarShape::make(symbol_pred, {m_axis});
        symbol_primary_item = opr::log(symbol_primary_item);
        // Inputs are already probabilities, so there is no log-sum-exp term;
        // start the loss from zero.
        symbol_loss = symbol_pred.make_scalar(.0f);
    }
    if (m_label_smooth > .0f) {
        symbol_loss = symbol_loss - m_label_smooth * symbol_middle -
                      (1 - m_label_smooth) * symbol_primary_item;
    } else {
        symbol_loss = symbol_loss - symbol_primary_item;
    }
    if (m_reduce_mode == ReduceMode::MEAN) {
        symbol_loss =
                opr::reduce_sum(symbol_loss.flatten(), symbol_loss.make_scalar(1)) /
                (float)(symbol_loss.shape().total_nr_elems());
    } else if (m_reduce_mode == ReduceMode::SUM) {
        symbol_loss =
                opr::reduce_sum(symbol_loss.flatten(), symbol_loss.make_scalar(1));
    }
    return symbol_loss;
}

MSELoss::MSELoss(ReduceMode reduce_mode) : m_reduce_mode(reduce_mode) {}

mgb::SymbolVar MSELoss::operator()(
        mgb::SymbolVar symbol_pred, mgb::SymbolVar symbol_label) {
    // Element-wise squared error, reduced according to m_reduce_mode.
    SymbolVar symbol_loss =
            opr::pow(symbol_pred - symbol_label, symbol_pred.make_scalar(2));
    if (m_reduce_mode == ReduceMode::SUM || m_reduce_mode == ReduceMode::MEAN)
        symbol_loss =
                opr::reduce_sum(symbol_loss.flatten(), symbol_loss.make_scalar(1));
    if (m_reduce_mode == ReduceMode::MEAN)
        symbol_loss = symbol_loss / (float)symbol_pred.shape().total_nr_elems();
    return symbol_loss;
}
} // namespace loss
} // namespace mgb
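
For reference, the `with_logits` path above assembles the label-smoothed cross entropy in the numerically stable log-sum-exp form; writing $x$ for the logits along `m_axis`, $y$ for the target index, $\epsilon$ for `label_smooth` and $n$ for the class count:

$$
\ell(x, y) = \max_j x_j + \log\sum_{j=1}^{n} e^{\,x_j - \max_k x_k}
\;-\; (1-\epsilon)\,x_y \;-\; \frac{\epsilon}{n}\sum_{j=1}^{n} x_j
$$

With $\epsilon = 0$ this reduces to the usual $\operatorname{logsumexp}(x) - x_y$.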
@@ -0,0 +1,143 @@
/**
 * \file src/opr/impl/training/optimizer.cpp
 *
 * This file is part of MegBrain, a deep learning framework developed by Megvii.
 *
 * \copyright Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 */
#include "megbrain/opr/training/optimizer.h"
#include "megbrain/exception.h"
#include "megbrain/opr/training/utils.h"

namespace mgb {
namespace optimizer {
SymbolVarArray Optimizer::make_multiple(
        SymbolVarArray symbol_weights, SymbolVarArray symbol_grads,
        std::shared_ptr<mgb::cg::ComputingGraph> graph) {
    if (symbol_weights.size() != symbol_grads.size()) {
        mgb_throw(AssertionError, "The count of weights differs from that of grads.");
    }
    SymbolVarArray r;
    for (size_t i = 0; i < symbol_weights.size(); i++) {
        r.push_back(make(symbol_weights[i], symbol_grads[i], graph));
    }
    return r;
}

SGD::SGD(float lr, float weight_decay, float momentum)
        : m_lr(lr), m_weight_decay(weight_decay), m_momentum(momentum) {
    if (m_lr <= 0) {
        mgb_throw(AssertionError, "Invalid learning rate: must be positive.");
    }
    if (m_weight_decay < 0) {
        mgb_throw(AssertionError, "Invalid weight_decay value: negative value.");
    }
    if (m_momentum < 0) {
        mgb_throw(AssertionError, "Invalid momentum value: negative value.");
    }
}

SymbolVar SGD::make(
        SymbolVar symbol_weight, SymbolVar symbol_grad,
        std::shared_ptr<cg::ComputingGraph> graph) {
    SymbolVar symbol_pre_grad;
    auto pre_grad = TensorGen::zeros<dtype::Float32>(
            symbol_grad.shape(), symbol_grad.node()->comp_node());
    m_pre_grads.push_back(pre_grad);
    symbol_pre_grad = opr::SharedDeviceTensor::make(*graph, *pre_grad);
    if (m_weight_decay != .0f) {
        symbol_grad = symbol_grad + m_weight_decay * symbol_weight;
    }
    // AddUpdate computes dest <- alpha * dest + beta * delta.
    if (m_momentum != .0f) {
        // v <- momentum * v + grad, then w <- w - lr * v.
        symbol_pre_grad =
                opr::AddUpdate::make(symbol_pre_grad, symbol_grad, {m_momentum, 1.0f});
        return opr::AddUpdate::make(symbol_weight, -symbol_pre_grad, {1.f, m_lr});
    } else {
        return opr::AddUpdate::make(symbol_weight, -symbol_grad, {1.f, m_lr});
    }
}

Adam::Adam(
        float lr, float weight_decay, std::pair<float, float> betas, float eps,
        bool amsgrad)
        : m_lr(lr),
          m_weight_decay(weight_decay),
          m_betas(betas),
          m_eps(eps),
          m_amsgrad(amsgrad) {
    mgb_assert(m_lr > 0, "Invalid learning rate: must be positive.");
    mgb_assert(m_weight_decay >= 0, "Invalid weight_decay value: negative value.");
    mgb_assert(
            m_betas.first >= 0 && m_betas.second >= 0 && m_betas.first < 1 &&
                    m_betas.second < 1,
            "Invalid betas value: each beta must lie in [0, 1).");
}

SymbolVar Adam::make(
        SymbolVar symbol_weight, SymbolVar symbol_grad,
        std::shared_ptr<cg::ComputingGraph> graph) {
    CompNode comp_node = symbol_grad.node()->comp_node();
    DType dt = symbol_grad.dtype();
    m_correction1 = TensorGen::ones<dtype::Float32>({1}, comp_node);
    m_correction2 = TensorGen::ones<dtype::Float32>({1}, comp_node);
    std::shared_ptr<DeviceTensorND> exp_avg =
            std::make_shared<DeviceTensorND>(comp_node, symbol_grad.shape(), dt);
    mgb::fill_zero_dev_tensor(*exp_avg);
    std::shared_ptr<DeviceTensorND> exp_avg_sq =
            std::make_shared<DeviceTensorND>(comp_node, symbol_grad.shape(), dt);
    mgb::fill_zero_dev_tensor(*exp_avg_sq);
    m_exp_avg.push_back(exp_avg);
    m_exp_avg_sq.push_back(exp_avg_sq);
    SymbolVar symbol_correction1 =
            opr::SharedDeviceTensor::make(*graph, *m_correction1);
    SymbolVar symbol_correction2 =
            opr::SharedDeviceTensor::make(*graph, *m_correction2);
    SymbolVar symbol_exp_avg = opr::SharedDeviceTensor::make(*graph, exp_avg);
    SymbolVar symbol_exp_avg_sq = opr::SharedDeviceTensor::make(*graph, exp_avg_sq);
    // Starting from 1 and multiplying by beta each step, correction1/2
    // accumulate beta1^t / beta2^t for the bias-correction terms below.
    symbol_correction1 = opr::AddUpdate::make(
            symbol_correction1, symbol_correction1, {m_betas.first, .0f});
    symbol_correction2 = opr::AddUpdate::make(
            symbol_correction2, symbol_correction2, {m_betas.second, .0f});
    if (m_weight_decay != .0f) {
        symbol_grad = symbol_grad + m_weight_decay * symbol_weight;
    }
    // m <- beta1 * m + (1 - beta1) * g,  v <- beta2 * v + (1 - beta2) * g^2.
    symbol_exp_avg = opr::AddUpdate::make(
            symbol_exp_avg, symbol_grad, {m_betas.first, 1.f - m_betas.first});
    symbol_exp_avg_sq = opr::AddUpdate::make(
            symbol_exp_avg_sq, symbol_grad * symbol_grad,
            {m_betas.second, 1.f - m_betas.second});
    SymbolVar delta;
    if (m_amsgrad) {
        std::shared_ptr<DeviceTensorND> max_exp_avg_sq =
                std::make_shared<DeviceTensorND>(comp_node, symbol_grad.shape(), dt);
        mgb::fill_zero_dev_tensor(*max_exp_avg_sq);
        m_max_exp_avg_sq.push_back(max_exp_avg_sq);
        SymbolVar symbol_max_exp_avg_sq =
                opr::SharedDeviceTensor::make(*graph, max_exp_avg_sq);
        // v_max <- max(v_max, v); alpha = 0 makes AddUpdate an assignment.
        symbol_max_exp_avg_sq = opr::AddUpdate::make(
                symbol_max_exp_avg_sq,
                opr::max(symbol_max_exp_avg_sq, symbol_exp_avg_sq), {.0f, 1.0f});
        delta = (symbol_exp_avg / (1.f - symbol_correction1)) /
                (opr::powf(symbol_max_exp_avg_sq / (1.f - symbol_correction2), 0.5f) +
                 m_eps);
    } else {
        delta = (symbol_exp_avg / (1.f - symbol_correction1)) /
                (opr::pow(
                         symbol_exp_avg_sq / (1.f - symbol_correction2),
                         symbol_exp_avg.make_scalar(0.5f)) +
                 m_eps);
    }
    return opr::AddUpdate::make(symbol_weight, -delta, {1.0f, m_lr});
}
} // namespace optimizer
} // namespace mgb
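
Summarizing the update rules that `SGD::make` and `Adam::make` emit through `AddUpdate` (which computes $d \leftarrow \alpha d + \beta\,\delta$), with $\eta$ the learning rate and $g_t$ the (optionally weight-decayed) gradient:

$$
\begin{aligned}
\text{SGD:}\quad & v_t = \mu\,v_{t-1} + g_t, & w &\leftarrow w - \eta\,v_t,\\
\text{Adam:}\quad & m_t = \beta_1 m_{t-1} + (1-\beta_1)\,g_t, \quad
v_t = \beta_2 v_{t-1} + (1-\beta_2)\,g_t^2, & w &\leftarrow w - \eta\,
\frac{m_t/(1-\beta_1^t)}{\sqrt{v_t/(1-\beta_2^t)} + \epsilon}.
\end{aligned}
$$

The $\beta_1^t$ and $\beta_2^t$ powers are exactly what `correction1` and `correction2` accumulate; with `amsgrad` the running maximum of $v_t$ replaces $v_t$ in the denominator.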
@@ -0,0 +1,69 @@
/**
 * \file src/opr/include/training/dataview.h
 *
 * This file is part of MegBrain, a deep learning framework developed by Megvii.
 *
 * \copyright Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 */
#pragma once

#include "megbrain/opr/basic_arith_wrapper.h"
#include "megbrain/opr/tensor_manip.h"

#include <type_traits>

namespace mgb {

using DataPair = std::pair<
        std::shared_ptr<mgb::HostTensorND>, std::shared_ptr<mgb::HostTensorND>>;

//! The interface of the dataset.
class IDataView {
public:
    /*!
     * Get the (data, label) pair at the given index.
     */
    virtual DataPair get_item(size_t idx) = 0;
    /*!
     * Get the number of samples in the dataset.
     */
    virtual size_t size() = 0;
    virtual ~IDataView() = default;
};

//! The data loader, corresponding to the DataLoader of the MegEngine Python
//! API.
class DataLoader {
public:
    DataLoader(
            std::shared_ptr<IDataView> dataview, mgb::CompNode compnode,
            unsigned long batchsize = 1U, bool shuffle = false, bool drop_last = true);
    /*!
     * Get the next batched (data, label) pair of the dataset.
     */
    DataPair next();
    /*!
     * Get the number of batches per epoch.
     */
    size_t size();

private:
    std::shared_ptr<IDataView> m_dataview;
    mgb::CompNode m_comp_node;
    unsigned long m_batchsize;
    bool m_shuffle;
    bool m_drop_last;
    size_t m_idx;
    mgb::TensorShape m_data_shape;
    mgb::TensorShape m_label_shape;
    mgb::DType m_data_type;
    mgb::DType m_label_type;
    //! Only used in the temporary shuffle implementation.
    std::vector<size_t> m_index_collection;
};
} // namespace mgb
@@ -0,0 +1,70 @@
/**
 * \file src/opr/include/training/loss.h
 *
 * This file is part of MegBrain, a deep learning framework developed by Megvii.
 *
 * \copyright Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 */
#pragma once

#include "megbrain/opr/basic_arith_wrapper.h"
#include "megbrain/opr/io.h"
#include "megbrain/opr/tensor_manip.h"
#include "megbrain/tensor.h"

namespace mgb {
namespace loss {
//! The interface that every loss class should implement.
class ILoss {
public:
    /*!
     * The reduce mode used to collapse the loss output to a scalar.
     */
    enum ReduceMode { SUM = 0, MEAN = 1 };
    /*!
     * Compute the loss; the output is a scalar SymbolVar.
     */
    virtual mgb::SymbolVar operator()(
            mgb::SymbolVar symbol_pred, mgb::SymbolVar symbol_label) = 0;
    virtual ~ILoss() = default;
};

/*!
 * The cross entropy loss. The definition can be found here:
 * https://en.wikipedia.org/wiki/Cross_entropy
 *
 * It corresponds to CrossEntropy in the MegEngine Python API.
 */
class CrossEntropyLoss : public ILoss {
public:
    CrossEntropyLoss(
            bool with_logits = true, float label_smooth = .0f,
            ReduceMode reduce_mode = ReduceMode::MEAN, int axis = 1);
    mgb::SymbolVar operator()(
            mgb::SymbolVar symbol_pred, mgb::SymbolVar symbol_label) override;

protected:
    bool m_with_logits;
    float m_label_smooth;
    ReduceMode m_reduce_mode;
    int m_axis;
};

/*!
 * The MSE (mean squared error) loss. The definition can be found here:
 * https://en.wikipedia.org/wiki/Mean_squared_error
 *
 * It corresponds to MSE in the MegEngine Python API.
 */
class MSELoss : public ILoss {
public:
    MSELoss(ReduceMode reduce_mode = ReduceMode::MEAN);
    mgb::SymbolVar operator()(
            mgb::SymbolVar symbol_pred, mgb::SymbolVar symbol_label) override;

protected:
    ReduceMode m_reduce_mode;
};
} // namespace loss
} // namespace mgb
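
A minimal usage sketch for the declarations above, assuming `symbol_pred` and `symbol_label` are SymbolVars already built on a graph (the loss unit test later in this patch shows the full wiring):

```cpp
// Mean-reduced cross entropy over class axis 1; all arguments shown are
// the declared defaults, spelled out for clarity.
loss::CrossEntropyLoss ce(
        /*with_logits=*/true, /*label_smooth=*/.0f,
        loss::ILoss::ReduceMode::MEAN, /*axis=*/1);
SymbolVar symbol_loss = ce(symbol_pred, symbol_label);
```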
@@ -0,0 +1,135 @@
/**
 * \file src/opr/include/training/optimizer.h
 *
 * This file is part of MegBrain, a deep learning framework developed by Megvii.
 *
 * \copyright Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 */
#pragma once

#include "megbrain/opr/basic_arith_wrapper.h"
#include "megbrain/opr/io.h"
#include "megbrain/opr/tensor_manip.h"
#include "megbrain/tensor.h"

namespace mgb {
namespace optimizer {
//! The interface that every optimizer should implement.
class IOptimizer {
public:
    /*!
     * Add update operations to the graph for a batch of weights, when the
     * inputs are SymbolVarArrays.
     */
    virtual mgb::SymbolVarArray make_multiple(
            mgb::SymbolVarArray symbol_weights, mgb::SymbolVarArray symbol_grads,
            std::shared_ptr<mgb::cg::ComputingGraph> graph) = 0;
    /*!
     * Add update operations to the graph that apply the optimizer's strategy
     * to a single weight. The output is the SymbolVar of the updated weight.
     */
    virtual mgb::SymbolVar make(
            mgb::SymbolVar symbol_weight, mgb::SymbolVar symbol_grad,
            std::shared_ptr<mgb::cg::ComputingGraph> graph) = 0;
    virtual ~IOptimizer() = default;
};

/*!
 * An abstract class that simplifies the implementation of optimizers.
 * It provides a default implementation of make_multiple based on the make
 * method defined by its derived class.
 */
class Optimizer : public IOptimizer {
public:
    mgb::SymbolVarArray make_multiple(
            mgb::SymbolVarArray symbol_weights, mgb::SymbolVarArray symbol_grads,
            std::shared_ptr<mgb::cg::ComputingGraph> graph) override;
    virtual mgb::SymbolVar make(
            mgb::SymbolVar symbol_weight, mgb::SymbolVar symbol_grad,
            std::shared_ptr<mgb::cg::ComputingGraph> graph) = 0;
    virtual ~Optimizer() = default;
};

/*!
 * The SGD (stochastic gradient descent) optimizer.
 * The definition can be found here:
 * https://en.wikipedia.org/wiki/Stochastic_gradient_descent
 * It corresponds to SGD in the MegEngine Python API.
 */
class SGD : public Optimizer {
public:
    SGD() = default;
    SGD(float lr, float weight_decay = .0f, float momentum = .0f);
    SGD(const SGD& that) {
        m_lr = that.m_lr;
        m_momentum = that.m_momentum;
        m_weight_decay = that.m_weight_decay;
    }
    mgb::SymbolVar make(
            mgb::SymbolVar symbol_weight, mgb::SymbolVar symbol_grad,
            std::shared_ptr<mgb::cg::ComputingGraph> graph) override;
    const SGD& operator=(const SGD& that) {
        m_lr = that.m_lr;
        m_momentum = that.m_momentum;
        m_weight_decay = that.m_weight_decay;
        return *this;
    }

protected:
    float m_lr;
    float m_weight_decay;
    float m_momentum;
    std::vector<std::shared_ptr<mgb::HostTensorND>> m_pre_grads;
};

/*!
 * The Adam optimizer. The definition can be found here:
 * https://en.wikipedia.org/wiki/Stochastic_gradient_descent#:~:text=full%2Dbatches.%5B26%5D-,Adam,-%5Bedit%5D
 * It corresponds to Adam in the MegEngine Python API.
 */
class Adam : public Optimizer {
public:
    Adam() = default;
    Adam(float lr, float weight_decay = .0f,
         std::pair<float, float> betas = {0.9f, 0.999f}, float eps = 1e-8f,
         bool amsgrad = false);
    Adam(const Adam& that) {
        m_lr = that.m_lr;
        m_betas = that.m_betas;
        m_eps = that.m_eps;
        m_weight_decay = that.m_weight_decay;
        m_amsgrad = that.m_amsgrad;
    }
    mgb::SymbolVar make(
            mgb::SymbolVar symbol_weight, mgb::SymbolVar symbol_grad,
            std::shared_ptr<mgb::cg::ComputingGraph> graph) override;
    const Adam& operator=(const Adam& that) {
        m_lr = that.m_lr;
        m_betas = that.m_betas;
        m_eps = that.m_eps;
        m_weight_decay = that.m_weight_decay;
        m_amsgrad = that.m_amsgrad;
        return *this;
    }

protected:
    float m_lr;
    float m_weight_decay;
    std::pair<float, float> m_betas;
    float m_eps;
    bool m_amsgrad;
    std::vector<std::shared_ptr<mgb::DeviceTensorND>> m_exp_avg;
    std::vector<std::shared_ptr<mgb::DeviceTensorND>> m_exp_avg_sq;
    std::vector<std::shared_ptr<mgb::DeviceTensorND>> m_max_exp_avg_sq;
    std::shared_ptr<mgb::HostTensorND> m_correction1;
    std::shared_ptr<mgb::HostTensorND> m_correction2;
};
} // namespace optimizer
} // namespace mgb
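
A sketch of how this API is meant to be driven, assuming `symbol_weights` and `symbol_grads` were obtained elsewhere (for instance via autodiff on a loss var); using OutputSpec entries with empty callbacks to force evaluation of the update nodes is an assumption here, not something this patch specifies:

```cpp
optimizer::Adam adam(0.01f);
mgb::SymbolVarArray updated =
        adam.make_multiple(symbol_weights, symbol_grads, graph);
mgb::cg::ComputingGraph::OutputSpec spec;
for (auto&& var : updated)
    spec.push_back({var, {}});  // AddUpdate mutates the shared tensors in place
auto func = graph->compile(spec);
func->execute();  // one optimization step
```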
@@ -0,0 +1,81 @@
/**
 * \file src/opr/include/tensor_gen.h
 *
 * This file is part of MegBrain, a deep learning framework developed by Megvii.
 *
 * \copyright Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 */
#pragma once

#include "megbrain/tensor.h"

namespace mgb {

/*!
 * A stateless helper class with static methods for generating host tensors.
 */
class TensorGen {
public:
    /*!
     * \brief Generate a tensor with all the elements equal to the given value
     */
    template <typename ctype, typename = typename mgb::ctype_enable_if<ctype>::type>
    static std::shared_ptr<mgb::HostTensorND> constant(
            mgb::TensorShape shape, ctype value,
            mgb::CompNode comp_node = mgb::CompNode::load("xpu0")) {
        std::shared_ptr<mgb::HostTensorND> r = std::make_shared<mgb::HostTensorND>(
                comp_node, shape, typename mgb::DTypeTrait<ctype>::dtype());
        auto ptr = r->ptr<ctype>();
        for (size_t i = 0, it = r->layout().total_nr_elems(); i < it; i++) {
            ptr[i] = value;
        }
        return r;
    }

    /*!
     * \brief Generate a tensor with all the elements equal to 0
     */
    template <typename T>
    static std::shared_ptr<mgb::HostTensorND> zeros(
            mgb::TensorShape shape,
            mgb::CompNode comp_node = mgb::CompNode::load("xpu0")) {
        static_assert(
                std::is_base_of<mgb::DType, T>(),
                "Please use the dtype in namespace mgb or use "
                "Tensor::constant.");
        using ctype = typename mgb::DTypeTrait<T>::ctype;
        return constant(shape, (ctype)0, comp_node);
    }

    /*!
     * \brief Generate a tensor with all the elements equal to 0. In this method
     * typename is not required.
     */
    static std::shared_ptr<mgb::HostTensorND> zeros(
            mgb::TensorShape shape, mgb::DType dtype = mgb::dtype::Float32(),
            mgb::CompNode comp_node = mgb::CompNode::load("xpu0")) {
        std::shared_ptr<mgb::HostTensorND> r =
                std::make_shared<mgb::HostTensorND>(comp_node, shape, dtype);
        // Zero the whole buffer, not just its first byte.
        memset(r->raw_ptr(), 0, r->layout().span().dist_byte());
        return r;
    }

    /*!
     * \brief Generate a tensor with all the elements equal to 1
     */
    template <typename T>
    static std::shared_ptr<mgb::HostTensorND> ones(
            mgb::TensorShape shape,
            mgb::CompNode comp_node = mgb::CompNode::load("xpu0")) {
        static_assert(
                std::is_base_of<mgb::DType, T>(),
                "Please use the dtype in namespace mgb or use "
                "Tensor::constant.");
        using ctype = typename mgb::DTypeTrait<T>::ctype;
        return constant(shape, (ctype)1, comp_node);
    }
};
} // namespace mgb
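
Quick examples of the three generators above; shapes, values, and dtypes are arbitrary:

```cpp
auto c  = TensorGen::constant({2, 3}, 1.5f);           // every element == 1.5f
auto z  = TensorGen::zeros<dtype::Float32>({2, 3});    // typed zeros via constant()
auto o  = TensorGen::ones<dtype::Int32>({4});          // typed ones via constant()
auto z2 = TensorGen::zeros({2, 3}, dtype::Float32());  // dtype chosen at runtime
```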
@@ -0,0 +1,106 @@
/**
 * \file src/opr/test/training/loss.cpp
 *
 * This file is part of MegBrain, a deep learning framework developed by Megvii.
 *
 * \copyright Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 */
#include "megbrain/opr/basic_arith_wrapper.h"
#include "megbrain/opr/indexing.h"
#include "megbrain/opr/io.h"
#include "megbrain/opr/tensor_manip.h"
#include "megbrain/tensor.h"
#include "megbrain/test/helper.h"

#include "megbrain/opr/training/loss.h"

using namespace mgb;
using namespace loss;

namespace {
class Device2HostCallback {
public:
    Device2HostCallback(std::shared_ptr<HostTensorND> host) : m_host{host} {}
    void operator()(const DeviceTensorND& device) { m_host->copy_from(device).sync(); }

private:
    std::shared_ptr<HostTensorND> m_host;
};

class CrossEntropyTest : public ::testing::Test {
private:
    std::shared_ptr<HostTensorND> pred, label, truth, loss;
    TensorShape pred_shape = {2, 10};
    TensorShape label_shape = {2};
    TensorShape truth_shape = {1};
    std::vector<float> pred_values = {
            -0.22847f, -0.65020f, -0.42470f, 1.32903f, -0.58377f, -0.15881f, -0.23134f,
            -0.36147f, -1.05848f, -0.23285f, 0.32360f, -0.36430f, -0.03172f, 1.18970f,
            -0.23465f, -0.16139f, -0.22942f, -0.22538f, -0.68029f, -0.41004f};
    std::vector<int> label_values = {5, 3};
    std::vector<float> truth_values = {1.8120441};
    CompNode node = CompNode::load("cpu0");
    std::shared_ptr<cg::ComputingGraph> graph;
    CrossEntropyLoss cross_entropy_loss;

public:
    std::unique_ptr<cg::AsyncExecutable> func;
    void setup();
    void build_model(float label_smooth = .0f);
    void verify();
    template <typename T>
    void assign_value(std::shared_ptr<HostTensorND> tensor, std::vector<T> value);
};
} // namespace

void CrossEntropyTest::setup() {
    pred = std::make_shared<HostTensorND>(node, pred_shape, dtype::Float32());
    label = std::make_shared<HostTensorND>(node, label_shape, dtype::Int32());
    truth = std::make_shared<HostTensorND>(node, truth_shape, dtype::Float32());
    loss = std::make_shared<HostTensorND>(node, truth_shape, dtype::Float32());
    assign_value<float>(pred, pred_values);
    assign_value<int>(label, label_values);
    assign_value<float>(truth, truth_values);
}

template <typename T>
void CrossEntropyTest::assign_value(
        std::shared_ptr<HostTensorND> tensor, std::vector<T> values) {
    ASSERT_EQ(values.size(), tensor->shape().total_nr_elems());
    auto ptr = tensor->ptr<T>();
    for (size_t i = 0, it = tensor->shape().total_nr_elems(); i < it; i += 1) {
        ptr[i] = values.at(i);
    }
}

void CrossEntropyTest::build_model(float label_smooth) {
    graph = cg::ComputingGraph::make();
    // Wire the requested label smoothing through to the loss under test.
    cross_entropy_loss = CrossEntropyLoss(/*with_logits=*/true, label_smooth);
    SymbolVar symbol_pred = opr::SharedDeviceTensor::make(*graph, *pred);
    SymbolVar symbol_label = opr::SharedDeviceTensor::make(*graph, *label);
    SymbolVar symbol_loss = cross_entropy_loss(symbol_pred, symbol_label);
    cg::ComputingGraph::OutputSpec spec;
    spec.push_back({symbol_loss, Device2HostCallback(loss)});
    func = graph->compile(spec);
}

void CrossEntropyTest::verify() {
    func->execute().wait();
    ASSERT_NEAR(loss->ptr<float>()[0], truth->ptr<float>()[0], 0.001f);
}

TEST_F(CrossEntropyTest, CrossEntropy) {
    setup();
    build_model();
    verify();
}
@@ -0,0 +1,98 @@
/**
 * \file src/opr/test/training/optimizer.cpp
 *
 * This file is part of MegBrain, a deep learning framework developed by Megvii.
 *
 * \copyright Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 */
#include "megbrain/opr/basic_arith_wrapper.h"
#include "megbrain/opr/indexing.h"
#include "megbrain/opr/io.h"
#include "megbrain/opr/tensor_manip.h"
#include "megbrain/tensor.h"
#include "megbrain/test/helper.h"

#include "megbrain/opr/training/optimizer.h"
#include "megbrain/opr/training/utils.h"

using namespace mgb;
using namespace optimizer;

namespace {
class Device2HostCallback {
public:
    Device2HostCallback(std::shared_ptr<HostTensorND> host) : m_host{host} {}
    void operator()(const DeviceTensorND& device) { m_host->copy_from(device).sync(); }

private:
    std::shared_ptr<HostTensorND> m_host;
};

template <typename T>
void assign_value(std::shared_ptr<HostTensorND>& tensor, std::vector<T>& values) {
    ASSERT_EQ(values.size(), tensor->layout().total_nr_elems());
    auto ptr = tensor->ptr<T>();
    for (size_t i = 0, it = tensor->layout().total_nr_elems(); i < it; i += 1) {
        ptr[i] = values.at(i);
    }
}

class OptimizerTest : public ::testing::Test {
public:
    void verify(
            std::shared_ptr<IOptimizer> optimizer, std::shared_ptr<HostTensorND> weight,
            std::shared_ptr<HostTensorND> grad, std::shared_ptr<HostTensorND> truth,
            int execute_times);

protected:
    std::shared_ptr<IOptimizer> optimizer;
    std::shared_ptr<cg::ComputingGraph> graph;
};

void OptimizerTest::verify(
        std::shared_ptr<IOptimizer> optimizer, std::shared_ptr<HostTensorND> weight,
        std::shared_ptr<HostTensorND> grad, std::shared_ptr<HostTensorND> truth,
        int execute_times) {
    graph = cg::ComputingGraph::make();
    SymbolVar symbol_weight = opr::SharedDeviceTensor::make(*graph, *weight);
    SymbolVar symbol_grad = opr::SharedDeviceTensor::make(*graph, *grad);
    cg::ComputingGraph::OutputSpec spec;
    spec.push_back(
            {optimizer->make(symbol_weight, symbol_grad, graph),
             Device2HostCallback(weight)});
    auto func = graph->compile(spec);
    for (int i = 0; i < execute_times; i++) {
        func->execute();
    }
    auto weight_ptr = weight->ptr<float>();
    auto truth_ptr = truth->ptr<float>();
    for (size_t i = 0, it = weight->shape().total_nr_elems(); i < it; i += 1) {
        ASSERT_NEAR(weight_ptr[i], truth_ptr[i], 0.001f);
    }
}
} // namespace

TEST_F(OptimizerTest, SGD) {
    auto weight = TensorGen::constant({1}, 0.30542f);
    auto grad = TensorGen::constant({1}, -1.81453f);
    auto truth = TensorGen::constant({1}, 1.04673f);
    int execute_times = 10;
    std::shared_ptr<SGD> sgd = std::make_shared<SGD>(0.01f, 5e-2f, 0.9f);
    verify(sgd, weight, grad, truth, execute_times);
}

TEST_F(OptimizerTest, AdamTest) {
    auto weight = TensorGen::constant({1}, 1.62957f);
    auto grad = TensorGen::constant({1}, 1.02605f);
    auto truth = TensorGen::constant({1}, 1.52969f);
    int execute_times = 10;
    std::shared_ptr<Adam> adam = std::make_shared<Adam>(0.01f, 0.9f);
    verify(adam, weight, grad, truth, execute_times);
}