fwd only
GitOrigin-RevId: 989474168d
HuaHua404-patch-1
@@ -1475,6 +1475,35 @@ protected:
using LAMB = LAMBUpdate;
class NormBase : public OperatorBase {
    DEF_OPR_PARAM(Norm);  // norm params are packed into the Norm param generated from the Python declaration
    DEF_OPR_IMPL(NormBase, OperatorBase, 1, 1);  // constructor and static members
public:
    virtual void deduce_layout(const TensorLayout& src, TensorLayout& dst) = 0;
    virtual size_t get_workspace_in_bytes(
            const TensorLayout& src, const TensorLayout& dst) = 0;
protected:
    void check_exec(
            const TensorLayout& src, const TensorLayout& dst,
            size_t workspace_in_bytes);
};
class NormForward : public NormBase {
    DEF_OPR_IMPL(NormForward, NormBase, 1, 1);
    using Mode = Param::Mode;
public:
    virtual void exec(
            _megdnn_tensor_in src, _megdnn_tensor_out dst,
            _megdnn_workspace workspace) = 0;
    virtual void deduce_layout(const TensorLayout& src, TensorLayout& dst);
    virtual size_t get_workspace_in_bytes(
            const TensorLayout& src, const TensorLayout& dst) = 0;
};
using Norm = NormForward;
} // namespace megdnn
#include "megdnn/internal/opr_header_epilogue.h"
@@ -1277,3 +1277,11 @@ PADDING_MODES = [Doc('REPLICATE = 0', 'aaaaaa|abcdefgh|hhhhhhh'),
 add_fields('bool', Doc('bias_correction', 'whether correct bias'), 'true').
 add_fields('bool', Doc('always_adapt', 'apply adaptive lr to 0.0'), 'false')
 )
(pdef("Norm").
 add_enum('Mode',
          Doc('P_NORM=0', 'calculate the p-norm; parameter p is ignored in the other modes'),
          Doc('INF_NORM=1', 'infinity norm'),
          Doc('NEG_INF_NORM=2', 'negative infinity norm'), name_field="mode").
 add_fields('float32', Doc('p', 'the order of the norm'), '2').
 add_fields('int32', Doc('dim', 'the dim along which the norm is computed'), '-1'),
 )
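
For reference, the three modes above correspond to the usual reductions along the chosen dim (standard definitions, not taken from this diff; p = 0 is treated as the count of non-zero entries, which is what the NormZeroOp kernels below compute):

$$\lVert x\rVert_p = \Big(\sum_i |x_i|^p\Big)^{1/p}, \qquad \lVert x\rVert_{\infty} = \max_i |x_i|, \qquad \lVert x\rVert_{-\infty} = \min_i |x_i|.$$
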
@@ -212,7 +212,8 @@ private:
    cb(LAMBUpdate) \
    cb(LSTMBackward) \
    cb(SoftmaxForward) \
    cb(SoftmaxBackward)
    cb(SoftmaxBackward) \
    cb(NormForward)
// clang-format on
/*!
@@ -0,0 +1,43 @@
#include "megdnn/oprs.h"
#include "src/common/utils.h"
namespace megdnn {
void NormForward::deduce_layout(const TensorLayout& src, TensorLayout& dst) {
    megdnn_assert(
            param().dim > -1 && param().dim < static_cast<dt_int32>(src.ndim),
"dim params must be passed and cannot be -1."); | |||||
    SmallVector<size_t> shapeList;
    for (size_t i = 0; i < src.ndim; ++i) {
        if (static_cast<dt_int32>(i) != param().dim) {
            shapeList.append(1, static_cast<size_t>(src.shape[i]));
        } else {
            shapeList.append(1, static_cast<size_t>(1));
        }
    }
    dst = TensorLayout{TensorShape(shapeList), src.dtype};
    return;
}
void NormBase::check_exec(
        const TensorLayout& src, const TensorLayout& dst, size_t workspace_in_bytes) {
    megdnn_assert_eq_dtype(src, dst);
#if !MEGDNN_DISABLE_FLOAT16
    megdnn_assert(
            src.dtype.enumv() == DTypeEnum::Float16 ||
                    src.dtype.enumv() == DTypeEnum::Float32,
"Float16 or Float32 is only supported."); | |||||
#else
    megdnn_assert(
            src.dtype.enumv() == DTypeEnum::Float32, "only Float32 is supported.");
#endif
    TensorLayout dst_expected;
    deduce_layout(src, dst_expected);
    megdnn_assert_eq_layout(dst_expected, dst);
    auto required_workspace_in_bytes = get_workspace_in_bytes(src, dst);
    megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes);
}
} // namespace megdnn
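
The shape rule implemented by `deduce_layout` above keeps the rank of the input and collapses the reduced dim to extent 1 (keepdim semantics). A minimal standalone illustration with plain `std::vector`, not megdnn types:

```cpp
#include <cstddef>
#include <vector>

// e.g. shape {1, 2, 3, 4} with dim = 1 becomes {1, 1, 3, 4}
std::vector<size_t> norm_output_shape(const std::vector<size_t>& src, size_t dim) {
    std::vector<size_t> dst(src);  // copy every extent
    dst[dim] = 1;                  // the reduced dim is kept with extent 1
    return dst;
}
```
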
@@ -16,6 +16,7 @@ struct OprTrait {};
    static const bool can_deduce_layout = CanDeduceLayout; \
    }
DEF(Norm, 2, true, true);
DEF(Padding, 2, false, true);
DEF(PaddingBackward, 2, false, false);
DEF(ConvolutionForward, 3, true, true);
@@ -47,6 +47,7 @@
#include "src/cuda/matrix_mul/opr_impl.h"
#include "src/cuda/max_tensor_diff/opr_impl.h"
#include "src/cuda/mesh_indexing/opr_impl.h"
#include "src/cuda/norm/opr_impl.h"
#include "src/cuda/padding/opr_impl.h"
#include "src/cuda/param_pack/opr_impl.h"
#include "src/cuda/pooling/opr_impl.h"
@@ -216,6 +217,7 @@ MEGDNN_SPECIALIZE_CREATE_OPERATOR(DropoutForward);
MEGDNN_SPECIALIZE_CREATE_OPERATOR(DropoutBackward);
MEGDNN_SPECIALIZE_CREATE_OPERATOR(SoftmaxForward);
MEGDNN_SPECIALIZE_CREATE_OPERATOR(SoftmaxBackward);
MEGDNN_SPECIALIZE_CREATE_OPERATOR(NormForward);
template <typename Opr>
std::unique_ptr<Opr> HandleImpl::create_operator() {
@@ -0,0 +1,28 @@
#include "helper.h"
#include "megdnn/dtype.h"
#include "src/cuda/reduce_helper.cuh"
namespace megdnn {
namespace cuda {
using namespace device_reduce;
#define COMMA ,
INST_REDUCE(NormOp<dt_float32 COMMA dt_float32 COMMA dt_float32>, false);
INST_REDUCE(NormOp<dt_float16 COMMA dt_float16 COMMA dt_float16>, false);
INST_REDUCE(NormZeroOp<dt_float32 COMMA dt_float32 COMMA dt_float32>, false);
INST_REDUCE(NormZeroOp<dt_float16 COMMA dt_float16 COMMA dt_float16>, false);
INST_REDUCE(NormOneOp<dt_float32 COMMA dt_float32 COMMA dt_float32>, false);
INST_REDUCE(NormOneOp<dt_float16 COMMA dt_float16 COMMA dt_float16>, false);
INST_REDUCE(NormTwoOp<dt_float32 COMMA dt_float32 COMMA dt_float32>, false);
INST_REDUCE(NormTwoOp<dt_float16 COMMA dt_float16 COMMA dt_float16>, false);
#undef COMMA
} // namespace cuda
} // namespace megdnn
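
The `COMMA` macro above exists because the preprocessor splits macro arguments on top-level commas, so `NormOp<dt_float32, dt_float32, dt_float32>` would otherwise be parsed as three arguments to `INST_REDUCE`. A tiny standalone illustration of the same trick (`DECLARE` is a made-up macro, not part of megdnn):

```cpp
#include <map>
#include <string>

#define COMMA ,
#define DECLARE(type, name) type name
// Without COMMA the comma inside the template argument list would split the
// macro argument; with it, the ',' only reappears after expansion.
DECLARE(std::map<int COMMA std::string>, table);  // -> std::map<int, std::string> table;
#undef DECLARE
#undef COMMA
```
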
@@ -0,0 +1,226 @@
#pragma once
#include "megdnn/dtype.h"
#if MEGDNN_CC_HOST
#include "megdnn/basic_types.h"
#endif
namespace megdnn {
namespace device_reduce {
template <typename src_ctype, typename dst_ctype, typename wtype_>
struct NormOp;
template <>
struct NormOp<dt_float32, dt_float32, dt_float32> {
    typedef dt_float32 wtype;
    typedef dt_float32 src_ctype;
    typedef dt_float32 dst_ctype;
    typedef wtype p_type;
    const wtype INIT;
    src_ctype* src;
    dst_ctype* dst;
    const size_t B;
    const p_type p;
    MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) {
        return powf(fabsf(src[idx]), p);
    }
    MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) {
        dst[idx] = powf(val, 1.f / p);
    }
    static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) {
        return lhs + rhs;
    }
    MEGDNN_HOST MEGDNN_DEVICE NormOp(src_ctype* src, dst_ctype* dst, size_t B, p_type p)
            : INIT(wtype(0)), src(src), dst(dst), B(B), p(static_cast<wtype>(p)) {}
};
#if !MEGDNN_DISABLE_FLOAT16
template <>
struct NormOp<dt_float16, dt_float16, dt_float16> {
    typedef dt_float16 wtype;
    typedef dt_float16 src_ctype;
    typedef dt_float16 dst_ctype;
    const wtype INIT;
    src_ctype* src;
    dst_ctype* dst;
    const size_t B;
    const wtype p;
    // the half_float API provides both host and device dispatch.
    MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) {
        return half_float::detail::pow(half_float::detail::abs(src[idx]), p);
    }
    MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) {
        dst[idx] = half_float::detail::pow(val, static_cast<wtype>(1.f) / p);
    }
    static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) {
        return lhs + rhs;
    }
    MEGDNN_HOST MEGDNN_DEVICE
    NormOp(src_ctype* src, dst_ctype* dst, size_t B, dt_float32 p)
            : INIT(wtype(0)), src(src), dst(dst), B(B), p(static_cast<wtype>(p)) {}
};
#endif
// TODO: the 0-norm impl needs a closer look at ReduceOp.
template <typename src_ctype, typename dst_ctype, typename wtype_>
struct NormZeroOp;
template <>
struct NormZeroOp<dt_float32, dt_float32, dt_float32> {
    typedef dt_float32 wtype;
    typedef dt_float32 src_ctype;
    typedef dt_float32 dst_ctype;
    const wtype INIT;
    src_ctype* src;
    dst_ctype* dst;
    const size_t B;
    const wtype epsilon = 0.00001f;
    MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) {
        return fabsf(src[idx] - 0.0f) <= epsilon ? 0.0f : 1.0f;
    }
    MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) { dst[idx] = val; }
    static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) {
        return lhs + rhs;
    }
    MEGDNN_HOST MEGDNN_DEVICE NormZeroOp(src_ctype* src, dst_ctype* dst, size_t B)
            : INIT(wtype(0)), src(src), dst(dst), B(B) {}
};
#if !MEGDNN_DISABLE_FLOAT16
template <>
struct NormZeroOp<dt_float16, dt_float16, dt_float16> {
    typedef dt_float16 wtype;
    typedef dt_float16 src_ctype;
    typedef dt_float16 dst_ctype;
    const wtype INIT;
    src_ctype* src;
    dst_ctype* dst;
    const size_t B;
    const wtype epsilon = half_float::half(0.00001f);
    MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) {
        return half_float::detail::fabs(src[idx] - half_float::half()) <= epsilon
                     ? half_float::half(0.0f)
                     : half_float::half(1.0f);
    }
    MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) { dst[idx] = val; }
    static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) {
        return lhs + rhs;
    }
    MEGDNN_HOST MEGDNN_DEVICE NormZeroOp(src_ctype* src, dst_ctype* dst, size_t B)
            : INIT(wtype(0)), src(src), dst(dst), B(B) {}
};
#endif
template <typename src_ctype, typename dst_ctype, typename wtype_>
struct NormOneOp;
template <>
struct NormOneOp<dt_float32, dt_float32, dt_float32> {
    typedef dt_float32 wtype;
    typedef dt_float32 src_ctype;
    typedef dt_float32 dst_ctype;
    const wtype INIT;
    src_ctype* src;
    dst_ctype* dst;
    const size_t B;
    MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) { return fabsf(src[idx]); }
    MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) { dst[idx] = val; }
    static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) {
        return lhs + rhs;
    }
    MEGDNN_HOST MEGDNN_DEVICE NormOneOp(src_ctype* src, dst_ctype* dst, size_t B)
            : INIT(wtype(0)), src(src), dst(dst), B(B) {}
};
#if !MEGDNN_DISABLE_FLOAT16
template <>
struct NormOneOp<dt_float16, dt_float16, dt_float16> {
    typedef dt_float16 wtype;
    typedef dt_float16 src_ctype;
    typedef dt_float16 dst_ctype;
    const wtype INIT;
    src_ctype* src;
    dst_ctype* dst;
    const size_t B;
    MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) {
        return half_float::detail::abs(src[idx]);
    }
    MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) { dst[idx] = val; }
    static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) {
        return lhs + rhs;
    }
    MEGDNN_HOST MEGDNN_DEVICE NormOneOp(src_ctype* src, dst_ctype* dst, size_t B)
            : INIT(wtype(0)), src(src), dst(dst), B(B) {}
};
#endif
template <typename src_ctype, typename dst_ctype, typename wtype_>
struct NormTwoOp;
template <>
struct NormTwoOp<dt_float32, dt_float32, dt_float32> {
    typedef dt_float32 wtype;
    typedef dt_float32 src_ctype;
    typedef dt_float32 dst_ctype;
    const wtype INIT;
    src_ctype* src;
    dst_ctype* dst;
    const size_t B;
    MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) { return src[idx] * src[idx]; }
    MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) {
        dst[idx] = sqrtf(val);
    }
    static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) {
        return lhs + rhs;
    }
    MEGDNN_HOST MEGDNN_DEVICE NormTwoOp(src_ctype* src, dst_ctype* dst, size_t B)
            : INIT(wtype(0)), src(src), dst(dst), B(B) {}
};
#if !MEGDNN_DISABLE_FLOAT16
template <>
struct NormTwoOp<dt_float16, dt_float16, dt_float16> {
    typedef dt_float16 wtype;
    typedef dt_float16 src_ctype;
    typedef dt_float16 dst_ctype;
    const wtype INIT;
    src_ctype* src;
    dst_ctype* dst;
    const size_t B;
    MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) { return src[idx] * src[idx]; }
    MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) {
        dst[idx] = half_float::detail::sqrt(val);
    }
    static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) {
        return lhs + rhs;
    }
    MEGDNN_HOST MEGDNN_DEVICE NormTwoOp(src_ctype* src, dst_ctype* dst, size_t B)
            : INIT(wtype(0)), src(src), dst(dst), B(B) {}
};
#endif
} // namespace device_reduce
} // namespace megdnn
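
All of the functors above follow the same read/apply/write protocol consumed by `run_reduce`: `INIT` is the identity of `apply`, `read(i)` maps the i-th input into the accumulation domain, `apply` folds two partial results, and `write(i, v)` post-processes the fully reduced value into `dst`. A hedged serial stand-in for the GPU kernel (the real tree reduction lives in `src/cuda/reduce_helper.cuh`):

```cpp
#include <cstddef>

// Serial sketch of how a reducer consumes one of the functors above for a
// single output element; run_reduce does the same work in parallel per output.
template <typename Op>
void reduce_one_output(Op op, size_t B) {
    auto acc = op.INIT;                    // identity element of apply()
    for (size_t i = 0; i < B; ++i)
        acc = Op::apply(acc, op.read(i));  // NormOp: acc += |src[i]|^p
    op.write(0, acc);                      // NormOp: dst[0] = acc^(1/p)
}
```
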
@@ -0,0 +1,180 @@
#include "src/cuda/norm/opr_impl.h"
#include "helper.h"
#include "src/common/reduce_helper_device.h"
#include "src/common/utils.h"
#include "src/cuda/handle.h"
#include "src/cuda/reduce_helper.cuh"
#include "src/cuda/utils.h"
namespace megdnn {
namespace cuda {
using namespace device_reduce;
using Mode = Norm::Mode;
template <>
void NormForwardImpl::dispatch_mode<Mode::NEG_INF_NORM>(
        _megdnn_tensor_inout src, _megdnn_tensor_inout dst, _megdnn_workspace workspace,
        size_t A, size_t B, size_t C, cudaStream_t stream) {
#define CASE(dt) \
    case DTypeTrait<dt>::enumv: { \
        using ctype = DTypeTrait<dt>::ctype; \
        auto reduceOp = \
                MinOp<ctype, ctype, ctype>(src.ptr<ctype>(), dst.ptr<ctype>(), B); \
        run_reduce<MinOp<ctype, ctype, ctype>, false>( \
                workspace.ptr<ctype>(), A, B, C, stream, reduceOp); \
        break; \
    };
    switch (src.layout.dtype.enumv()) {
        CASE(::megdnn::dtype::Float32)
#if !MEGDNN_DISABLE_FLOAT16
        CASE(::megdnn::dtype::Float16)
#endif
        default:
            megdnn_assert_internal(false);
    }
#undef CASE
}
template <>
void NormForwardImpl::dispatch_mode<Mode::INF_NORM>(
        _megdnn_tensor_inout src, _megdnn_tensor_inout dst, _megdnn_workspace workspace,
        size_t A, size_t B, size_t C, cudaStream_t stream) {
#define CASE(dt) \
    case DTypeTrait<dt>::enumv: { \
        using ctype = DTypeTrait<dt>::ctype; \
        auto reduceOp = \
                MaxOp<ctype, ctype, ctype>(src.ptr<ctype>(), dst.ptr<ctype>(), B); \
        run_reduce<MaxOp<ctype, ctype, ctype>, false>( \
                workspace.ptr<ctype>(), A, B, C, stream, reduceOp); \
        break; \
    };
    switch (src.layout.dtype.enumv()) {
        CASE(::megdnn::dtype::Float32)
#if !MEGDNN_DISABLE_FLOAT16
        CASE(::megdnn::dtype::Float16)
#endif
        default:
            megdnn_assert_internal(false);
    }
#undef CASE
}
template <>
void NormForwardImpl::dispatch_mode<Mode::P_NORM>(
        _megdnn_tensor_inout src, _megdnn_tensor_inout dst, _megdnn_workspace workspace,
        size_t A, size_t B, size_t C, cudaStream_t stream) {
    typedef dt_float32 p_type;
#define CASE(dt) \
    case DTypeTrait<dt>::enumv: { \
        using ctype = DTypeTrait<dt>::ctype; \
        p_type epsilon = 0.000001f; \
        if (fabs(param().p - 0.0f) < epsilon) { \
            run_reduce<NormZeroOp<ctype, ctype, ctype>, false>( \
                    workspace.ptr<ctype>(), A, B, C, stream, \
                    NormZeroOp<ctype, ctype, ctype>( \
                            src.ptr<ctype>(), dst.ptr<ctype>(), B)); \
        } else if (fabs(param().p - 1.0f) < epsilon) { \
            run_reduce<NormOneOp<ctype, ctype, ctype>, false>( \
                    workspace.ptr<ctype>(), A, B, C, stream, \
                    NormOneOp<ctype, ctype, ctype>( \
                            src.ptr<ctype>(), dst.ptr<ctype>(), B)); \
        } else if (fabs(param().p - 2.0f) < epsilon) { \
            run_reduce<NormTwoOp<ctype, ctype, ctype>, false>( \
                    workspace.ptr<ctype>(), A, B, C, stream, \
                    NormTwoOp<ctype, ctype, ctype>( \
                            src.ptr<ctype>(), dst.ptr<ctype>(), B)); \
        } else { \
            run_reduce<NormOp<ctype, ctype, ctype>, false>( \
                    workspace.ptr<ctype>(), A, B, C, stream, \
                    NormOp<ctype, ctype, ctype>( \
                            src.ptr<ctype>(), dst.ptr<ctype>(), B, param().p)); \
        } \
        break; \
    };
    switch (src.layout.dtype.enumv()) {
        CASE(::megdnn::dtype::Float32)
#if !MEGDNN_DISABLE_FLOAT16
        CASE(::megdnn::dtype::Float16)
#endif
        default:
            megdnn_assert_internal(false);
    }
#undef CASE
}
} // namespace cuda
namespace cuda {
void NormForwardImpl::exec(
        _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace) {
    check_exec(src.layout, dst.layout, workspace.size);
    size_t A, B, C;
    reduce::get_ABC(src.layout, A, B, C, param().dim);
    auto stream = cuda_stream(this->handle());
#define CASE(mode) \
    case mode: { \
        dispatch_mode<mode>(src, dst, workspace, A, B, C, stream); \
        break; \
    };
    switch (param().mode) {
        CASE(Mode::P_NORM)
        CASE(Mode::INF_NORM)
        CASE(Mode::NEG_INF_NORM)
        default:
            megdnn_assert_internal(false);
    }
#undef CASE
    return;
}
size_t NormForwardImpl::get_workspace_in_bytes(
        const TensorLayout& src, const TensorLayout& dst) {
    using namespace device_reduce;
    size_t A, B, C;
    reduce::get_ABC(src, A, B, C, param().dim);
#define cb(dt, op) \
    case DTypeTrait<dt>::enumv: { \
        using ctype = DTypeTrait<dt>::ctype; \
        return get_reduce_workspace_in_bytes<op<ctype, ctype, ctype>>(A, B, C); \
        break; \
    };
#if !MEGDNN_DISABLE_FLOAT16
#define CASE(mode, op) \
    case mode: { \
        switch (src.dtype.enumv()) { \
            cb(::megdnn::dtype::Float32, op) cb(::megdnn::dtype::Float16, op) default \
                    : megdnn_assert_internal(false); \
        } \
    };
#else
#define CASE(mode, op) \
    case mode: { \
        switch (src.dtype.enumv()) { \
            cb(::megdnn::dtype::Float32, op) default : megdnn_assert_internal(false); \
        } \
    };
#endif
    // XXX: the 0-/1-norm are dispatched to different Ops, but their workspace
    // size is the same as NormOp's.
    switch (param().mode) {
        CASE(Mode::INF_NORM, MaxOp)
        CASE(Mode::NEG_INF_NORM, MinOp)
        CASE(Mode::P_NORM, NormOp)
        default:
            megdnn_assert_internal(false);
    }
#undef CASE
#undef cb
}
} // namespace cuda
} // namespace megdnn
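
The kernels above rely on `reduce::get_ABC`, which is not part of this diff. The assumed semantics (consistent with the `src[a * B * C + b * C + c]` indexing used by the naive implementation below) are that the layout is viewed as a contiguous (A, B, C) block with B the reduced dim:

```cpp
#include <cstddef>

// Assumed behaviour of reduce::get_ABC (sketch, not the megdnn implementation):
// A = product of extents before `dim`, B = extent of `dim`, C = product after.
void get_abc_sketch(
        const size_t* shape, size_t ndim, size_t dim, size_t& A, size_t& B, size_t& C) {
    A = 1;
    B = shape[dim];
    C = 1;
    for (size_t i = 0; i < dim; ++i)
        A *= shape[i];
    for (size_t i = dim + 1; i < ndim; ++i)
        C *= shape[i];
}
```
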
@@ -0,0 +1,25 @@
#pragma once
#include "megdnn/oprs.h"
#include "src/cuda/utils.h"
namespace megdnn {
namespace cuda {
class NormForwardImpl : public NormForward {
    using Norm::Norm;
public:
    void exec(
            _megdnn_tensor_in src, _megdnn_tensor_out dst,
            _megdnn_workspace workspace) override;
    size_t get_workspace_in_bytes(
            const TensorLayout& src, const TensorLayout& dst) override;
protected:
    template <Mode mode>
    void dispatch_mode(
            _megdnn_tensor_inout src, _megdnn_tensor_inout dst,
            _megdnn_workspace workspace, size_t A, size_t B, size_t C,
            cudaStream_t stream);
};
} // namespace cuda
} // namespace megdnn
@@ -51,6 +51,7 @@
#include "src/naive/matrix_mul/opr_impl.h"
#include "src/naive/max_tensor_diff/opr_impl.h"
#include "src/naive/mesh_indexing/opr_impl.h"
#include "src/naive/norm/opr_impl.h"
#include "src/naive/padding/opr_impl.h"
#include "src/naive/param_pack/opr_impl.h"
#include "src/naive/pooling/opr_impl.h"
@@ -0,0 +1,152 @@
#pragma once
#include <algorithm>
#include <numeric>
#include "megdnn/basic_types.h"
#include "megdnn/dtype.h"
#include "src/common/utils.h"
using namespace megdnn;
/* anonymous namespace */
namespace {
using Mode = Reduce::Mode;
/* Reduce Trait */
template <Mode mode, typename ctype>
struct Trait;
template <typename ctype>
struct Trait<Mode::SUM, ctype> {
    static const ctype INIT;
    static ctype apply(ctype x, ctype y) { return x + y; }
    static ctype visit(ctype x) { return x; }
    static ctype write(ctype x, size_t) { return x; }
};
template <typename ctype>
const ctype Trait<Mode::SUM, ctype>::INIT = ctype(0);
template <typename ctype>
struct Trait<Mode::MEAN, ctype> {
    static const ctype INIT;
    static ctype apply(ctype x, ctype y) { return x + y; }
    static ctype visit(ctype x) { return x; }
    static ctype write(ctype x, size_t B) { return x / (ctype)B; }
};
template <typename ctype>
const ctype Trait<Mode::MEAN, ctype>::INIT = ctype(0);
template <typename ctype>
struct Trait<Mode::SUM_SQR, ctype> {
    static const ctype INIT;
    static ctype apply(ctype x, ctype y) { return x + y; }
    static ctype visit(ctype x) { return x * x; }
    static ctype write(ctype x, size_t) { return x; }
};
template <typename ctype>
const ctype Trait<Mode::SUM_SQR, ctype>::INIT = ctype(0);
template <typename ctype>
struct Trait<Mode::PRODUCT, ctype> {
    static const ctype INIT;
    static ctype apply(ctype x, ctype y) { return x * y; }
    static ctype visit(ctype x) { return x; }
    static ctype write(ctype x, size_t) { return x; }
};
template <typename ctype>
const ctype Trait<Mode::PRODUCT, ctype>::INIT = ctype(1);
template <typename ctype>
struct Trait<Mode::MIN, ctype> {
    static ctype apply(ctype x, ctype y) { return x < y ? x : y; }
    static ctype visit(ctype x) { return x; }
    static ctype write(ctype x, size_t) { return x; }
};
template <>
struct Trait<Mode::MIN, dt_float32> {
    using ctype = dt_float32;
    static ctype apply(ctype x, ctype y) { return (std::isnan(x) || x < y) ? x : y; }
    static ctype visit(ctype x) { return x; }
    static ctype write(ctype x, size_t) { return x; }
};
template <typename ctype>
struct Trait<Mode::MAX, ctype> {
    static ctype apply(ctype x, ctype y) { return x > y ? x : y; }
    static ctype visit(ctype x) { return x; }
    static ctype write(ctype x, size_t) { return x; }
};
template <>
struct Trait<Mode::MAX, dt_float32> {
    using ctype = dt_float32;
    static ctype apply(ctype x, ctype y) { return (std::isnan(x) || x > y) ? x : y; }
    static ctype visit(ctype x) { return x; }
    static ctype write(ctype x, size_t) { return x; }
};
/* NormOp */
template <typename ctype>
struct NormOp;
template <>
struct NormOp<dt_float32> {
    typedef dt_float32 ctype;
    static const ctype INIT;
    static ctype apply(ctype x, ctype y) { return x + y; }
    static ctype visit(ctype x, dt_float32 p) { return powf(fabs(x), p); }
    static ctype write(ctype x, size_t, dt_float32 p) { return powf(x, 1.f / p); }
};
#if !MEGDNN_DISABLE_FLOAT16
template <>
struct NormOp<dt_float16> {
    typedef dt_float16 ctype;
    static const ctype INIT;
    static ctype apply(ctype x, ctype y) { return x + y; }
    static ctype visit(ctype x, dt_float32 p) {
        return half_float::pow(half_float::abs(x), half_float::half(p));
    }
    static ctype write(ctype x, size_t, dt_float32 p) {
        return half_float::pow(x, half_float::half(1.f / p));
    }
};
#endif
template <typename ctype>
struct NormZeroOp;
template <>
struct NormZeroOp<dt_float32> {
    typedef dt_float32 ctype;
    static const ctype INIT;
    static ctype apply(ctype x, ctype y) { return x + y; }
    // count an element as non-zero when |x| exceeds the epsilon (matches the CUDA NormZeroOp)
    static ctype visit(ctype x) { return fabs(x) < 0.00001f ? 0.f : 1.f; }
    static ctype write(ctype x, size_t) { return x; }
};
#if !MEGDNN_DISABLE_FLOAT16
template <>
struct NormZeroOp<dt_float16> {
    typedef dt_float16 ctype;
    static const ctype INIT;
    static ctype apply(ctype x, ctype y) { return x + y; }
    static ctype visit(ctype x) {
        return half_float::abs(x) < half_float::half(0.00001f)
                     ? half_float::half(0.f)
                     : half_float::half(1.f);
    }
    static ctype write(ctype x, size_t) { return x; }
};
#endif
} // namespace
@@ -0,0 +1,197 @@
#include "src/naive/norm/opr_impl.h"
#include "helper.h"
#include "src/common/utils.h"
#include "src/naive/handle.h"
namespace megdnn {
namespace naive {
using Mode = Norm::Mode;
template <>
void NormForwardImpl::dispatch_mode<Mode::NEG_INF_NORM>(
        _megdnn_tensor_in src, _megdnn_tensor_out dst, size_t A, size_t B, size_t C) {
#define CASE(dt) \
    case DTypeTrait<dt>::enumv: { \
        using ctype = DTypeTrait<dt>::ctype; \
        const ctype* __restrict sptr = src.ptr<ctype>(); \
        ctype* __restrict dptr = dst.ptr<ctype>(); \
        std::function<ctype(size_t, size_t, size_t, size_t)> func; \
        func = [&](size_t a, size_t c, size_t bl, size_t br) -> ctype { \
            if (bl + 1 < br) { \
                size_t mid = bl + (br - bl) / 2; \
                return Trait<ReduceForward::Mode::MIN, ctype>::apply( \
                        func(a, c, bl, mid), func(a, c, mid, br)); \
            } else { \
                return Trait<ReduceForward::Mode::MIN, ctype>::visit( \
                        sptr[a * B * C + bl * C + c]); \
            } \
        }; \
        for (size_t a = 0; a < A; ++a) \
            for (size_t c = 0; c < C; ++c) { \
                dptr[a * C + c] = Trait<ReduceForward::Mode::MIN, ctype>::write( \
                        func(a, c, 0, B), B); \
            } \
        break; \
    };
    switch (src.layout.dtype.enumv()) {
        CASE(::megdnn::dtype::Float32)
#if !MEGDNN_DISABLE_FLOAT16
        CASE(::megdnn::dtype::Float16)
#endif
        default:
            megdnn_assert_internal(false);
    }
#undef CASE
}
template <>
void NormForwardImpl::dispatch_mode<Mode::INF_NORM>(
        _megdnn_tensor_in src, _megdnn_tensor_out dst, size_t A, size_t B, size_t C) {
#define CASE(dt) \
    case DTypeTrait<dt>::enumv: { \
        using ctype = DTypeTrait<dt>::ctype; \
        const ctype* __restrict sptr = src.ptr<ctype>(); \
        ctype* __restrict dptr = dst.ptr<ctype>(); \
        std::function<ctype(size_t, size_t, size_t, size_t)> func; \
        func = [&](size_t a, size_t c, size_t bl, size_t br) -> ctype { \
            if (bl + 1 < br) { \
                size_t mid = bl + (br - bl) / 2; \
                return Trait<ReduceForward::Mode::MAX, ctype>::apply( \
                        func(a, c, bl, mid), func(a, c, mid, br)); \
            } else { \
                return Trait<ReduceForward::Mode::MAX, ctype>::visit( \
                        sptr[a * B * C + bl * C + c]); \
            } \
        }; \
        for (size_t a = 0; a < A; ++a) \
            for (size_t c = 0; c < C; ++c) { \
                dptr[a * C + c] = Trait<ReduceForward::Mode::MAX, ctype>::write( \
                        func(a, c, 0, B), B); \
            } \
        break; \
    };
    switch (src.layout.dtype.enumv()) {
        CASE(::megdnn::dtype::Float32)
#if !MEGDNN_DISABLE_FLOAT16
        CASE(::megdnn::dtype::Float16)
#endif
        default:
            megdnn_assert_internal(false);
    }
#undef CASE
}
template <>
void NormForwardImpl::dispatch_mode<Mode::P_NORM>(
        _megdnn_tensor_in src, _megdnn_tensor_out dst, size_t A, size_t B, size_t C) {
#define CASE(dt) \
    case DTypeTrait<dt>::enumv: { \
        using ctype = DTypeTrait<dt>::ctype; \
        const ctype* __restrict sptr = src.ptr<ctype>(); \
        ctype* __restrict dptr = dst.ptr<ctype>(); \
        std::function<ctype(size_t, size_t, size_t, size_t)> func; \
        if (fabs(param().p - 0.f) < 0.00001f) { \
            func = [&](size_t a, size_t c, size_t bl, size_t br) -> ctype { \
                if (bl + 1 < br) { \
                    size_t mid = bl + (br - bl) / 2; \
                    return NormZeroOp<ctype>::apply( \
                            func(a, c, bl, mid), func(a, c, mid, br)); \
                } else { \
                    return NormZeroOp<ctype>::visit(sptr[a * B * C + bl * C + c]); \
                } \
            }; \
            for (size_t a = 0; a < A; ++a) { \
                for (size_t c = 0; c < C; ++c) { \
                    dptr[a * C + c] = NormZeroOp<ctype>::write(func(a, c, 0, B), B); \
                } \
            } \
        } else { \
            func = [&](size_t a, size_t c, size_t bl, size_t br) -> ctype { \
                if (bl + 1 < br) { \
                    size_t mid = bl + (br - bl) / 2; \
                    return NormOp<ctype>::apply( \
                            func(a, c, bl, mid), func(a, c, mid, br)); \
                } else { \
                    return NormOp<ctype>::visit( \
                            sptr[a * B * C + bl * C + c], param().p); \
                } \
            }; \
            for (size_t a = 0; a < A; ++a) { \
                for (size_t c = 0; c < C; ++c) { \
                    dptr[a * C + c] = \
                            NormOp<ctype>::write(func(a, c, 0, B), B, param().p); \
                } \
            } \
        } \
        break; \
    };
    switch (src.layout.dtype.enumv()) {
        CASE(::megdnn::dtype::Float32)
#if !MEGDNN_DISABLE_FLOAT16
        CASE(::megdnn::dtype::Float16)
#endif
        default:
            megdnn_assert_internal(false);
    }
#undef CASE
}
void NormForwardImpl::exec(
        _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace) {
    check_exec(src.layout, dst.layout, workspace.size);
    using namespace reduce;
    size_t A, B, C;
    reduce::get_ABC(src.layout, A, B, C, param().dim);
    auto make_tensor = [&](DType comp_dtype, _megdnn_tensor_inout tensor,
                           dt_byte*& workspace_ptr) {
        if (comp_dtype == tensor.layout.dtype)
            return tensor;
        auto layout = TensorLayout(tensor.layout, comp_dtype);
        TensorND new_tensor{workspace_ptr, layout};
        workspace_ptr += layout.span().dist_byte();
        return new_tensor;
    };
    auto typecvt = handle()->create_operator<TypeCvt>();
    auto copy_to = [&typecvt](const TensorND& from, const TensorND& to) {
        if (from.raw_ptr() != to.raw_ptr())
            typecvt->exec(from, to);
    };
    auto workspace_ptr = workspace.ptr<dt_byte>();
    auto new_src = make_tensor(src.layout.dtype, src, workspace_ptr);
    auto new_dst = make_tensor(dst.layout.dtype, dst, workspace_ptr);
#define CASE(mode) \
    case mode: { \
        copy_to(src, new_src); \
        ::megdnn::naive::HandleImpl* handlePtr = static_cast<HandleImpl*>(handle()); \
        MEGDNN_DISPATCH_CPU_KERN( \
                handlePtr, dispatch_mode<mode>(new_src, new_dst, A, B, C)); \
        copy_to(new_dst, dst); \
        break; \
    };
    switch (param().mode) {
        CASE(Mode::P_NORM)
        CASE(Mode::INF_NORM)
        CASE(Mode::NEG_INF_NORM)
        default:
            megdnn_assert_internal(false);
    }
#undef CASE
}
size_t NormForwardImpl::get_workspace_in_bytes(
        const TensorLayout& src, const TensorLayout& dst) {
    MEGDNN_MARK_USED_VAR(src);
    MEGDNN_MARK_USED_VAR(dst);
    return 0;
}
} // namespace naive
} // namespace megdnn
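
One design point worth calling out in the naive kernels above: instead of a left-to-right accumulation over the B axis, the CASE macros build a recursive lambda that splits [bl, br) at the midpoint, i.e. a balanced pairwise reduction. That keeps the floating-point summation order close to the CUDA tree reduction and bounds accumulation error better than a sequential loop. A minimal sketch of the same recursion with plain floats (names are mine):

```cpp
#include <cstddef>

// Pairwise (tree) summation over data[bl, br); the real code applies
// Trait/NormOp::visit() at the leaves and Trait/NormOp::apply() at the merges.
float pairwise_sum(const float* data, size_t bl, size_t br) {
    if (bl + 1 < br) {
        size_t mid = bl + (br - bl) / 2;
        return pairwise_sum(data, bl, mid) + pairwise_sum(data, mid, br);
    }
    return data[bl];
}
```
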
@@ -0,0 +1,23 @@
#pragma once
#include "megdnn/oprs.h"
#include "src/common/reduce_helper.h"
#include "src/naive/reduce/opr_impl.h"
namespace megdnn {
namespace naive {
class NormForwardImpl : public Norm {
public:
    using Norm::Norm;
    void exec(
            _megdnn_tensor_in src, _megdnn_tensor_out dst,
            _megdnn_workspace workspace) override;
    size_t get_workspace_in_bytes(
            const TensorLayout& src, const TensorLayout& dst) override;
protected:
    template <Mode mode>
    void dispatch_mode(
            _megdnn_tensor_in src, _megdnn_tensor_out dst, size_t, size_t, size_t);
};
} // namespace naive
} // namespace megdnn
@@ -0,0 +1,19 @@
#pragma once
#include <iostream>
#include "megdnn/basic_types.h"
#include "megdnn/opr_param_defs.h"
namespace megdnn {
namespace test {
namespace norm {
struct TestArg {
    param::Norm param;
    TensorShape src;
    TestArg(param::Norm param, TensorShape src) : param(param), src(src) {}
};
} // namespace norm
} // namespace test
} // namespace megdnn
@@ -0,0 +1,291 @@
#include "test/common/norm.h"
#include "megdnn/dtype.h"
#include "megdnn/oprs.h"
#include "test/common/checker.h"
// #include "test/naive/fixture.h"
// #include "test/common/benchmarker.h"
#include <iostream>
#include "test/cuda/benchmark.h"
#include "test/cuda/fixture.h"
#include "test/cuda/utils.h"
namespace megdnn {
namespace test {
// CORRECT
// L2, fp32, dim
TEST_F(CUDA, L2NORM_FP32_DIM0) {
    Checker<Norm> checker(handle_cuda());
    Norm::Param param;
    param.p = 2;
    param.dim = 0;
    checker.set_param(param);
    checker.exect(
            Testcase{
                    TensorValue(
                            {1, 2, 3, 4}, dtype::Float32(),
                            {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                             12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}),
                    {}},
            Testcase{
                    {},
                    TensorValue(
                            {1, 2, 3, 4}, dtype::Float32(),
                            {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                             12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}),
            });
}
TEST_F(CUDA, L2NORM_FP32_DIM1) {
    Checker<Norm> checker(handle_cuda());
    Norm::Param param;
    param.p = 2;
    param.dim = 1;
    checker.set_param(param);
    checker.exect(
            Testcase{
                    TensorValue(
                            {1, 2, 3, 4}, dtype::Float32(),
                            {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                             12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}),
                    {}},
            Testcase{
                    {},
                    TensorValue(
                            {1, 1, 3, 4}, dtype::Float32(),
                            {12.000, 13.0384, 14.1421, 15.2971, 16.4924, 17.7200,
                             18.9737, 20.2485, 21.5407, 22.8473, 24.1661, 25.4951}),
            });
}
TEST_F(CUDA, L2NORM_FP32_DIM3) {
    Checker<Norm> checker(handle_cuda());
    Norm::Param param;
    param.p = 2;
    param.dim = 3;
    checker.set_param(param).exect(
            Testcase{
                    TensorValue(
                            {1, 2, 3, 4}, dtype::Float32(),
                            {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                             12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}),
                    {}},
            Testcase{
                    {},
                    TensorValue(
                            {1, 2, 3, 1}, dtype::Float32(),
                            {3.7417, 11.2250, 19.1311, 27.0924, 35.0714, 43.0581})});
}
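
As a quick sanity check on the expected values above (worked by hand, not taken from the diff): with dim = 3 the reduction runs over rows of four consecutive integers, so the first and last outputs are

$$\sqrt{0^2 + 1^2 + 2^2 + 3^2} = \sqrt{14} \approx 3.7417, \qquad \sqrt{20^2 + 21^2 + 22^2 + 23^2} = \sqrt{1854} \approx 43.0581,$$

matching the first and last entries of the expected tensor.
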
// TODO: support dim = -1, or add a test for the assert
// l2, fp16
TEST_F(CUDA, L2NORM_FP16_DIM3) {
    Checker<Norm> checker(handle_cuda());
    Norm::Param param;
    param.p = 2;
    param.dim = 3;
    checker.set_param(param).exect(
            Testcase{
                    TensorValue(
                            {1, 2, 3, 4}, dtype::Float16(),
                            {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                             12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}),
                    {}},
            Testcase{
                    {},
                    TensorValue(
                            {1, 2, 3, 1}, dtype::Float16(),
                            {3.7422, 11.2266, 19.1250, 27.0938, 35.0625, 43.0625})});
}
// l1, fp32,fp16
TEST_F(CUDA, L1NORM_FP32_DIM3) {
    Checker<Norm> checker(handle_cuda());
    Norm::Param param;
    param.p = 1;
    param.dim = 3;
    checker.set_param(param).exect(
            Testcase{
                    TensorValue(
                            {1, 2, 3, 4}, dtype::Float32(),
                            {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                             12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}),
                    {}},
            Testcase{
                    {},
                    TensorValue(
                            {1, 2, 3, 1}, dtype::Float32(), {6, 22, 38, 54, 70, 86}),
            });
}
TEST_F(CUDA, L1NORM_FP16_DIM3) {
    Checker<Norm> checker(handle_cuda());
    Norm::Param param;
    param.p = 1;
    param.dim = 3;
    checker.set_param(param).exect(
            Testcase{
                    TensorValue(
                            {1, 2, 3, 4}, dtype::Float16(),
                            {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                             12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}),
                    {}},
            Testcase{
                    {},
                    TensorValue(
                            {1, 2, 3, 1}, dtype::Float16(), {6, 22, 38, 54, 70, 86}),
            });
}
// l0, fp32,fp16
TEST_F(CUDA, L0NORM_FP32_DIM3) {
    Checker<Norm> checker(handle_cuda());
    Norm::Param param;
    param.p = 0;
    param.dim = 3;
    checker.set_param(param).exect(
            Testcase{
                    TensorValue(
                            {1, 2, 3, 4}, dtype::Float32(),
                            {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                             12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}),
                    {}},
            Testcase{
                    {},
                    TensorValue({1, 2, 3, 1}, dtype::Float32(), {3, 4, 4, 4, 4, 4}),
            });
}
TEST_F(CUDA, L0NORM_FP16_DIM3) {
    Checker<Norm> checker(handle_cuda());
    Norm::Param param;
    param.p = 0;
    param.dim = 3;
    checker.set_param(param).exect(
            Testcase{
                    TensorValue(
                            {1, 2, 3, 4}, dtype::Float16(),
                            {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                             12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}),
                    {}},
            Testcase{
                    {},
                    TensorValue({1, 2, 3, 1}, dtype::Float16(), {3, 4, 4, 4, 4, 4}),
            });
}
// inf
TEST_F(CUDA, INF_NORM_FP32_DIM3) {
    Checker<Norm> checker(handle_cuda());
    Norm::Param param;
    using Mode = Norm::Param::Mode;
    param.dim = 3;
    param.mode = Mode::INF_NORM;
    checker.set_param(param).exect(
            Testcase{
                    TensorValue(
                            {1, 2, 3, 4}, dtype::Float32(),
                            {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                             12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}),
                    {}},
            Testcase{
                    {},
                    TensorValue({1, 2, 3, 1}, dtype::Float32(), {3, 7, 11, 15, 19, 23}),
            });
}
TEST_F(CUDA, INF_NORM_FP16_DIM3) {
    Checker<Norm> checker(handle_cuda());
    Norm::Param param;
    using Mode = Norm::Param::Mode;
    param.dim = 3;
    param.mode = Mode::INF_NORM;
    checker.set_param(param).exect(
            Testcase{
                    TensorValue(
                            {1, 2, 3, 4}, dtype::Float16(),
                            {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                             12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}),
                    {}},
            Testcase{
                    {},
                    TensorValue({1, 2, 3, 1}, dtype::Float16(), {3, 7, 11, 15, 19, 23}),
            });
}
// -inf
TEST_F(CUDA, NEG_INF_NORM_FP32_DIM3) {
    Checker<Norm> checker(handle_cuda());
    Norm::Param param;
    param.mode = Norm::Param::Mode::NEG_INF_NORM;
    param.dim = 3;
    checker.set_param(param).exect(
            Testcase{
                    TensorValue(
                            {1, 2, 3, 4}, dtype::Float32(),
                            {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                             12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}),
                    {}},
            Testcase{
                    {},
                    TensorValue({1, 2, 3, 1}, dtype::Float32(), {0, 4, 8, 12, 16, 20}),
            });
}
TEST_F(CUDA, NEG_INF_NORM_FP16_DIM3) {
    Checker<Norm> checker(handle_cuda());
    Norm::Param param;
    param.mode = Norm::Param::Mode::NEG_INF_NORM;
    param.dim = 3;
    checker.set_param(param).exect(
            Testcase{
                    TensorValue(
                            {1, 2, 3, 4}, dtype::Float16(),
                            {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                             12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}),
                    {}},
            Testcase{
                    {},
                    TensorValue({1, 2, 3, 1}, dtype::Float16(), {0, 4, 8, 12, 16, 20}),
            });
}
// PERF
TEST_F(CUDA, L2NORM_SPEED_FP32) {
    auto benchmarker = Benchmarker<Norm>(handle_cuda());
    benchmarker.set_dtype(0, dtype::Float32());
    benchmarker.set_dtype(1, dtype::Float32());
    Norm::Param param;
    param.mode = Norm::Param::Mode::P_NORM;
    param.dim = 0;
    param.p = 2;
    SmallVector<TensorShape> shapes{{4194304}, {}};
    NormalRNG rng(0, 1);
    float eachTime;
    float totalTime = 0.f;
#define ITER 10
    for (auto i = 0; i < ITER; i++) {
        eachTime = benchmarker.set_param(param).set_rng(0, &rng).exec(shapes);
        // printf("PNORM_SPEED_FP32 cuda time: %.6fms\n", eachTime);
        totalTime += eachTime;
    }
    totalTime /= ITER;
    printf("PNORM_SPEED_FP32 AVG TIME: %.6fms\n", totalTime);
#undef ITER
}
TEST_F(CUDA, INFNORM_SPEED_FP32) {
    auto benchmarker = Benchmarker<Norm>(handle_cuda());
    benchmarker.set_dtype(0, dtype::Float32());
    benchmarker.set_dtype(1, dtype::Float32());
    Norm::Param param;
    param.mode = Norm::Param::Mode::INF_NORM;
    param.dim = 0;
    SmallVector<TensorShape> shapes{{4194304}, {}};
    NormalRNG rng(0, 1);
    float time_fp32 = benchmarker.set_param(param).set_rng(0, &rng).exec(shapes);
    printf("INF_SPEED_FP32 cuda time: float=%.6fms\n", time_fp32);
}
TEST_F(CUDA, NEG_INFNORM_SPEED_FP32) {
    auto benchmarker = Benchmarker<Norm>(handle_cuda());
    benchmarker.set_dtype(0, dtype::Float32());
    benchmarker.set_dtype(1, dtype::Float32());
    Norm::Param param;
    param.mode = Norm::Param::Mode::NEG_INF_NORM;
    param.dim = 0;
    SmallVector<TensorShape> shapes{{4194304}, {}};
    NormalRNG rng(0, 1);
    float time_fp32 = benchmarker.set_param(param).set_rng(0, &rng).exec(shapes);
    printf("NEG_INF_SPEED_FP32 cuda time: float=%.6fms\n", time_fp32);
}
} // namespace test
} // namespace megdnn
@@ -0,0 +1,237 @@
#include "test/common/norm.h"
#include "megdnn/dtype.h"
#include "megdnn/oprs.h"
#include "test/common/benchmarker.h"
#include "test/common/checker.h"
#include "test/naive/fixture.h"
namespace megdnn {
namespace test {
TEST_F(NAIVE, L2NORM_FP32_DIM0) {
    Checker<Norm> checker(handle(), false);
    Norm::Param param;
    param.p = 2;
    param.dim = 0;
    checker.set_param(param);
    checker.exect(
            Testcase{
                    TensorValue(
                            {1, 2, 3, 4}, dtype::Float32(),
                            {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                             12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}),
                    {}},
            Testcase{
                    {},
                    TensorValue(
                            {1, 2, 3, 4}, dtype::Float32(),
                            {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                             12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}),
            });
}
TEST_F(NAIVE, L2NORM_FP32_DIM1) {
    Checker<Norm> checker(handle());
    Norm::Param param;
    param.p = 2;
    param.dim = 1;
    checker.set_param(param);
    checker.exect(
            Testcase{
                    TensorValue(
                            {1, 2, 3, 4}, dtype::Float32(),
                            {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                             12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}),
                    {}},
            Testcase{
                    {},
                    TensorValue(
                            {1, 1, 3, 4}, dtype::Float32(),
                            {12.000, 13.0384, 14.1421, 15.2971, 16.4924, 17.7200,
                             18.9737, 20.2485, 21.5407, 22.8473, 24.1661, 25.4951}),
            });
}
TEST_F(NAIVE, L2NORM_FP32_DIM3) {
    Checker<Norm> checker(handle());
    Norm::Param param;
    param.p = 2;
    param.dim = 3;
    checker.set_param(param).exect(
            Testcase{
                    TensorValue(
                            {1, 2, 3, 4}, dtype::Float32(),
                            {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                             12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}),
                    {}},
            Testcase{
                    {},
                    TensorValue(
                            {1, 2, 3, 1}, dtype::Float32(),
                            {3.7417, 11.2250, 19.1311, 27.0924, 35.0714, 43.0581})});
}
// l2, fp16
TEST_F(NAIVE, L2NORM_FP16_DIM3) {
    Checker<Norm> checker(handle());
    Norm::Param param;
    param.p = 2;
    param.dim = 3;
    checker.set_param(param).exect(
            Testcase{
                    TensorValue(
                            {1, 2, 3, 4}, dtype::Float16(),
                            {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                             12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}),
                    {}},
            Testcase{
                    {},
                    TensorValue(
                            {1, 2, 3, 1}, dtype::Float16(),
                            {3.7422, 11.2266, 19.1250, 27.0938, 35.0625, 43.0625})});
}
// l1, fp32,fp16
TEST_F(NAIVE, L1NORM_FP32_DIM3) {
    Checker<Norm> checker(handle());
    Norm::Param param;
    param.p = 1;
    param.dim = 3;
    checker.set_param(param).exect(
            Testcase{
                    TensorValue(
                            {1, 2, 3, 4}, dtype::Float32(),
                            {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                             12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}),
                    {}},
            Testcase{
                    {},
                    TensorValue(
                            {1, 2, 3, 1}, dtype::Float32(), {6, 22, 38, 54, 70, 86}),
            });
}
TEST_F(NAIVE, L1NORM_FP16_DIM3) {
    Checker<Norm> checker(handle());
    Norm::Param param;
    param.p = 1;
    param.dim = 3;
    checker.set_param(param).exect(
            Testcase{
                    TensorValue(
                            {1, 2, 3, 4}, dtype::Float16(),
                            {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                             12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}),
                    {}},
            Testcase{
                    {},
                    TensorValue(
                            {1, 2, 3, 1}, dtype::Float16(), {6, 22, 38, 54, 70, 86}),
            });
}
// l0, fp32,fp16
TEST_F(NAIVE, L0NORM_FP32_DIM3) {
    Checker<Norm> checker(handle());
    Norm::Param param;
    param.p = 0;
    param.dim = 3;
    checker.set_param(param).exect(
            Testcase{
                    TensorValue(
                            {1, 2, 3, 4}, dtype::Float32(),
                            {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                             12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}),
                    {}},
            Testcase{
                    {},
                    TensorValue({1, 2, 3, 1}, dtype::Float32(), {3, 4, 4, 4, 4, 4}),
            });
}
TEST_F(NAIVE, L0NORM_FP16_DIM3) {
    Checker<Norm> checker(handle());
    Norm::Param param;
    param.p = 0;
    param.dim = 3;
    checker.set_param(param).exect(
            Testcase{
                    TensorValue(
                            {1, 2, 3, 4}, dtype::Float16(),
                            {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                             12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}),
                    {}},
            Testcase{
                    {},
                    TensorValue({1, 2, 3, 1}, dtype::Float16(), {3, 4, 4, 4, 4, 4}),
            });
}
// inf
TEST_F(NAIVE, INF_NORM_FP32_DIM3) {
    Checker<Norm> checker(handle());
    Norm::Param param;
    using Mode = Norm::Param::Mode;
    param.dim = 3;
    param.mode = Mode::INF_NORM;
    checker.set_param(param).exect(
            Testcase{
                    TensorValue(
                            {1, 2, 3, 4}, dtype::Float32(),
                            {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                             12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}),
                    {}},
            Testcase{
                    {},
                    TensorValue({1, 2, 3, 1}, dtype::Float32(), {3, 7, 11, 15, 19, 23}),
            });
}
TEST_F(NAIVE, INF_NORM_FP16_DIM3) {
    Checker<Norm> checker(handle());
    Norm::Param param;
    using Mode = Norm::Param::Mode;
    param.dim = 3;
    param.mode = Mode::INF_NORM;
    checker.set_param(param).exect(
            Testcase{
                    TensorValue(
                            {1, 2, 3, 4}, dtype::Float16(),
                            {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                             12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}),
                    {}},
            Testcase{
                    {},
                    TensorValue({1, 2, 3, 1}, dtype::Float16(), {3, 7, 11, 15, 19, 23}),
            });
}
// -inf
TEST_F(NAIVE, NEG_INF_NORM_FP32_DIM3) {
    Checker<Norm> checker(handle());
    Norm::Param param;
    param.mode = Norm::Param::Mode::NEG_INF_NORM;
    param.dim = 3;
    checker.set_param(param).exect(
            Testcase{
                    TensorValue(
                            {1, 2, 3, 4}, dtype::Float32(),
                            {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                             12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}),
                    {}},
            Testcase{
                    {},
                    TensorValue({1, 2, 3, 1}, dtype::Float32(), {0, 4, 8, 12, 16, 20}),
            });
}
TEST_F(NAIVE, NEG_INF_NORM_FP16_DIM3) {
    Checker<Norm> checker(handle());
    Norm::Param param;
    param.mode = Norm::Param::Mode::NEG_INF_NORM;
    param.dim = 3;
    checker.set_param(param).exect(
            Testcase{
                    TensorValue(
                            {1, 2, 3, 4}, dtype::Float16(),
                            {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                             12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}),
                    {}},
            Testcase{
                    {},
                    TensorValue({1, 2, 3, 1}, dtype::Float16(), {0, 4, 8, 12, 16, 20}),
            });
}
} // namespace test
} // namespace megdnn