* Adapt to the new llvm/clang-11 version
* Fix the Windows Bazel build with CUDA
* Add a Windows Bazel CUDA build CI job
* Optimize the Windows Bazel CI scripts
GitOrigin-RevId: 6ea7c66585
release-1.2
@@ -26,7 +26,7 @@ public:
     size_t get_workspace_in_bytes(const TensorLayout& /*src*/,
                                   const TensorLayout& /*mask_offset*/,
                                   const TensorLayout& /*mask_val*/,
-                                  const TensorLayout& /*dst*/) {
+                                  const TensorLayout& /*dst*/) override {
         return 0;
     };
     void set_error_tracker(void* tracker) override {
@@ -43,7 +43,7 @@ public:
             const TensorLayout& /* out_diff */,
             const TensorLayout& /* out_count */,
             const TensorLayout& /* data_diff */,
-            const TensorLayout& /* trans_diff */) {
+            const TensorLayout& /* trans_diff */) override {
        return 0ULL;
    };
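Both hunks above silence the same clang-11 complaint: these classes already mark sibling virtual members (e.g. set_error_tracker) with `override`, so clang's -Winconsistent-missing-override (on by default, fatal under -Werror) flags the ones that omit it. A minimal reduction of the pattern, with hypothetical stand-in names for the megdnn classes:

    // clang++ -std=c++14 -Werror reduction.cpp
    #include <cstddef>

    struct OprBase {
        virtual size_t get_workspace_in_bytes() = 0;
        virtual void set_error_tracker(void*) = 0;
        virtual ~OprBase() = default;
    };

    struct OprImpl : OprBase {
        // warning: overrides a member function but is not marked 'override'
        // [-Winconsistent-missing-override], because the sibling below uses it
        size_t get_workspace_in_bytes() { return 0; }
        void set_error_tracker(void*) override {}
    };

Adding `override` also guards against silent signature drift in the base class.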
@@ -192,13 +192,11 @@ void megdnn::cuda::exec_span_qsum(const int32_t* qSumA, const uint32_t M,
                                   int32_t* dst, const uint32_t strd,
                                   const int32_t scaler_bias,
                                   cudaStream_t stream) {
-    constexpr size_t TX = 32, TY = 32;
-    constexpr size_t BX = 32, BY = 32;
+    constexpr uint32_t TX = 32, TY = 32, BX = 32, BY = 32;
     dim3 nthreads{TX, TY};
-    dim3 nblocks{static_cast<uint32_t>(DIVUP(N, BX)),
-                 static_cast<uint32_t>(DIVUP(M, BY))};
-    span_qsum<TX, TY, BX, BY><<<nblocks, nthreads, 0, stream>>>(qSumA, M, qSumB, N, dst, strd,
-                                                                scaler_bias);
+    dim3 nblocks{DIVUP(N, BX), DIVUP(M, BY)};
+    span_qsum<TX, TY, BX, BY><<<nblocks, nthreads, 0, stream>>>(
+            qSumA, M, qSumB, N, dst, strd, scaler_bias);
     after_kernel_launch();
 }
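The casts can go away because dim3's fields are `unsigned int`, and list-initializing them from a runtime size_t expression is a narrowing conversion that clang-11 rejects; once TX/TY/BX/BY are uint32_t, DIVUP(N, BX) stays in 32-bit unsigned arithmetic. A sketch of the distinction (DIVUP is spelled out here and the function name is hypothetical; build with the CUDA headers available):

    #include <cstdint>
    #include <cuda_runtime.h>  // for dim3

    #define DIVUP(x, y) (((x) + (y)-1) / (y))

    void launch_config(uint32_t M, uint32_t N) {
        constexpr uint32_t BX = 32, BY = 32;
        // DIVUP(N, BX) is uint32_t, so the braced init of dim3's
        // unsigned int members involves no narrowing:
        dim3 nblocks{DIVUP(N, BX), DIVUP(M, BY)};
        // With the old `constexpr size_t BX`, DIVUP(N, BX) would be a
        // runtime size_t and the line above would be ill-formed under
        // clang-11 -- exactly what the removed static_cast<uint32_t>s
        // were papering over.
        (void)nblocks;
    }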
@@ -46,10 +46,7 @@ using namespace wmma::experimental::precision;
 namespace wmma_matrix_mul_u4 {
-constexpr size_t WMMA_M = 8;
-constexpr size_t WMMA_N = 8;
-constexpr size_t WMMA_K = 32;
-constexpr size_t WARP_SIZE = 32;
+constexpr uint32_t WMMA_M = 8, WMMA_N = 8, WMMA_K = 32, WARP_SIZE = 32;
 
 template <size_t WARP_X_, size_t WARP_Y_, size_t ROW_PER_WARP_,
           size_t COL_PER_WARP_>
@@ -334,10 +331,8 @@ __global__ void u4_gemm_template_device_nt(const uint8_t* /*A*/,
 void _do_dispatch_wmma_matrix_mul_u4(const uint8_t* A, const uint8_t* B,
                                      int32_t* C, int M, int N, int K, int lda,
                                      int ldb, int ldc, cudaStream_t stream) {
-    constexpr size_t warp_x = 4;
-    constexpr size_t warp_y = 4;
-    constexpr size_t row_per_warp = 4;
-    constexpr size_t col_per_warp = 4;
+    constexpr uint32_t warp_x = 4, warp_y = 4, row_per_warp = 4,
+                       col_per_warp = 4;
     typedef BlockConfig<warp_x, warp_y, row_per_warp, col_per_warp>
             BlockConfig_;
     dim3 block{warp_x * WARP_SIZE, warp_y};
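These two wmma hunks move in the same direction: the constants feed dim3 construction and kernel launch bounds, so keeping them in uint32_t avoids 64-to-32-bit conversions there, and the comma-fold is purely cosmetic. The uint32_t constants still work as arguments to BlockConfig's `size_t` non-type template parameters, since that widening conversion is exact. A small check of that claim (BlockConfigLike is a hypothetical stand-in):

    #include <cstddef>
    #include <cstdint>

    template <size_t WARP_X_, size_t WARP_Y_>
    struct BlockConfigLike {
        static constexpr size_t WARP_X = WARP_X_;
        static constexpr size_t WARP_Y = WARP_Y_;
    };

    constexpr uint32_t warp_x = 4, warp_y = 4;
    // uint32_t constants convert losslessly to the size_t template
    // parameters, so narrowing the declarations changes nothing here:
    using Cfg = BlockConfigLike<warp_x, warp_y>;
    static_assert(Cfg::WARP_X == 4 && Cfg::WARP_Y == 4, "unchanged");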
@@ -110,8 +110,10 @@ void do_cvt_normal_s32(_megdnn_tensor_in src, _megdnn_tensor_out dst) {
     float scale = dst.layout.dtype.param<dtype::QuantizedS32>().scale;
     float dscale = 1.f / scale;
     for (size_t i = 0; i < n; ++i) {
-        dptr[i] = saturate<int32_t, float>(std::round(sptr[i] * dscale),
-                                           -2147483648, 2147483647);
+        dptr[i] = saturate<int32_t, float>(
+                std::round(sptr[i] * dscale),
+                static_cast<float>(std::numeric_limits<int32_t>::min()),
+                static_cast<float>(std::numeric_limits<int32_t>::max()));
     }
 }
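The old literals were doubly awkward: `-2147483648` is unary minus applied to `2147483648`, which does not fit in `int`, and the implicit int-to-float conversion of `2147483647` changes its value to 2147483648.0f, which clang-11 diagnoses via -Wimplicit-const-int-float-conversion. The float rounding itself is unavoidable; the explicit casts just make it intentional and warning-free. The same rewrite recurs in the three do_cvt_* hunks below and in QConverter. A reduced sketch, assuming saturate clamps its first argument to [lo, hi] before converting:

    #include <cmath>
    #include <cstdint>
    #include <limits>

    // Stand-in for megdnn's saturate helper (assumed semantics: clamp in
    // ctype, then convert to dtype).
    template <typename dtype, typename ctype>
    dtype saturate(ctype val, ctype lo, ctype hi) {
        return static_cast<dtype>(val < lo ? lo : (val > hi ? hi : val));
    }

    int32_t cvt(float x, float scale) {
        // saturate<int32_t, float>(v, -2147483648, 2147483647) would make
        // clang-11 warn: 2147483647 is not exactly representable as float.
        return saturate<int32_t, float>(
                std::round(x * scale),
                static_cast<float>(std::numeric_limits<int32_t>::min()),
                static_cast<float>(std::numeric_limits<int32_t>::max()));
    }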
@@ -219,8 +221,10 @@ void do_cvt_s8_s32(_megdnn_tensor_in src, _megdnn_tensor_out dst) {
     float dst_scale = dst.layout.dtype.param<dtype::QuantizedS32>().scale;
     float scale = src_scale / dst_scale;
     for (size_t i = 0; i < n; ++i) {
-        dptr[i] = saturate<int32_t, float>(std::round(sptr[i] * scale),
-                                           -2147483648, 2147483647);
+        dptr[i] = saturate<int32_t, float>(
+                std::round(sptr[i] * scale),
+                static_cast<float>(std::numeric_limits<int32_t>::min()),
+                static_cast<float>(std::numeric_limits<int32_t>::max()));
     }
 }
@@ -232,8 +236,10 @@ void do_cvt_s32_s32(_megdnn_tensor_in src, _megdnn_tensor_out dst) {
     float dst_scale = dst.layout.dtype.param<dtype::QuantizedS32>().scale;
     float scale = src_scale / dst_scale;
     for (size_t i = 0; i < n; ++i) {
-        dptr[i] = saturate<int32_t, float>(std::round(sptr[i] * scale),
-                                           -2147483648, 2147483647);
+        dptr[i] = saturate<int32_t, float>(
+                std::round(sptr[i] * scale),
+                static_cast<float>(std::numeric_limits<int32_t>::min()),
+                static_cast<float>(std::numeric_limits<int32_t>::max()));
     }
 }
@@ -247,9 +253,10 @@ void do_cvt_asymm8_s32(_megdnn_tensor_in src, _megdnn_tensor_out dst) {
     float dst_scale = dst.layout.dtype.param<dtype::QuantizedS32>().scale;
     float scale = src_scale / dst_scale;
     for (size_t i = 0; i < n; ++i) {
-        dptr[i] =
-                saturate<int32_t, float>(std::round((sptr[i] - src_zp) * scale),
-                                         -2147483648, 2147483647);
+        dptr[i] = saturate<int32_t, float>(
+                std::round((sptr[i] - src_zp) * scale),
+                static_cast<float>(std::numeric_limits<int32_t>::min()),
+                static_cast<float>(std::numeric_limits<int32_t>::max()));
     }
 }
@@ -66,8 +66,10 @@ inline dt_quint8 QConverter::convert(const float& src, const uint8_t& zp) {
 template <>
 inline dt_qint32 QConverter::convert(const float& src) {
-    return dt_qint32(
-            saturate<int32_t, float>(std::round(src), -2147483648, 2147483647));
+    return dt_qint32(saturate<int32_t, float>(
+            std::round(src),
+            static_cast<float>(std::numeric_limits<int32_t>::min()),
+            static_cast<float>(std::numeric_limits<int32_t>::max())));
 }
 
 template <>
@@ -101,7 +101,8 @@ public:
     ComputingSequence(const std::shared_ptr<ComputingGraph>& graph)
             : m_owner_graph_refkeep{graph},
               m_owner_graph{ComputingGraphImpl::downcast(graph.get())},
-              m_have_parent_graph{m_owner_graph->m_parent_graph} {}
+              m_have_parent_graph{
+                      static_cast<bool>(m_owner_graph->m_parent_graph)} {}
 
     GraphExecutable::ExecEnv& exec_env() { return m_exec_env; }
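This one is presumably clang-11 implementing P1957R2, which makes pointer-to-bool a narrowing conversion in list-initialization: m_have_parent_graph is a bool, m_parent_graph a pointer (or pointer-like) member, so the braced member init becomes ill-formed without the cast. A hypothetical reduction:

    struct Graph;  // stand-in for the owner-graph type

    struct Seq {
        bool m_have_parent_graph;
        // clang-11 rejects `m_have_parent_graph{parent}`: narrowing from
        // 'Graph *' to 'bool' in a braced initializer (P1957R2).
        Seq(Graph* parent)
                : m_have_parent_graph{static_cast<bool>(parent)} {}
    };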
@@ -371,9 +371,11 @@ serialization::IndexDescMaskDump::from_index_desc(const IndexDesc &desc) {
     ret.nr_item = desc.size();
     for (size_t i = 0; i < desc.size(); ++ i) {
         auto &&s = desc[i];
-        ret.items[i] = {
-                static_cast<int8_t>(s.axis.get_raw()),
-                s.begin.node(), s.end.node(), s.step.node(), s.idx.node()};
+        ret.items[i] = {static_cast<int8_t>(s.axis.get_raw()),
+                        static_cast<bool>(s.begin.node()),
+                        static_cast<bool>(s.end.node()),
+                        static_cast<bool>(s.step.node()),
+                        static_cast<bool>(s.idx.node())};
     }
     return ret;
 }
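Same P1957R2 narrowing fix: the dumped items apparently store presence flags rather than the nodes themselves, so each node() pointer must be converted to bool explicitly inside the braced initializer. A sketch of the presumed layout (struct and field names are hypothetical):

    #include <cstdint>

    struct Node;

    // The serialized mask item records only which slice components exist.
    struct MaskItem {
        int8_t axis;
        bool begin, end, step, idx;
    };

    MaskItem make_item(int8_t axis, Node* begin, Node* end, Node* step,
                       Node* idx) {
        // Initializing the bool fields straight from pointers would be a
        // narrowing conversion under clang-11:
        return {axis, static_cast<bool>(begin), static_cast<bool>(end),
                static_cast<bool>(step), static_cast<bool>(idx)};
    }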