* Adapt to the new llvm/clang-11 version
* Fix the Windows Bazel build with CUDA
* Add a Windows Bazel CUDA build CI job
* Optimize the Windows Bazel CI scripts
GitOrigin-RevId: 6ea7c66585
release-1.2
@@ -26,7 +26,7 @@ public:
     size_t get_workspace_in_bytes(const TensorLayout& /*src*/,
                                   const TensorLayout& /*mask_offset*/,
                                   const TensorLayout& /*mask_val*/,
-                                  const TensorLayout& /*dst*/) {
+                                  const TensorLayout& /*dst*/) override {
         return 0;
     };
     void set_error_tracker(void* tracker) override {
@@ -43,7 +43,7 @@ public:
             const TensorLayout& /* out_diff */,
             const TensorLayout& /* out_count */,
             const TensorLayout& /* data_diff */,
-            const TensorLayout& /* trans_diff */) {
+            const TensorLayout& /* trans_diff */) override {
        return 0ULL;
    };
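Both hunks above silence the same clang-11 complaint: these classes already mark sibling virtual members (e.g. set_error_tracker) with `override`, so clang's -Winconsistent-missing-override (on by default, fatal under -Werror) flags the ones that omit it. A minimal reduction of the pattern, with hypothetical stand-in names for the megdnn classes:

    // clang++ -std=c++14 -Werror reduction.cpp
    #include <cstddef>

    struct OprBase {
        virtual size_t get_workspace_in_bytes() = 0;
        virtual void set_error_tracker(void*) = 0;
        virtual ~OprBase() = default;
    };

    struct OprImpl : OprBase {
        // warning: overrides a member function but is not marked 'override'
        // [-Winconsistent-missing-override], because the sibling below uses it
        size_t get_workspace_in_bytes() { return 0; }
        void set_error_tracker(void*) override {}
    };

Adding `override` also guards against silent signature drift in the base class.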
@@ -192,13 +192,11 @@ void megdnn::cuda::exec_span_qsum(const int32_t* qSumA, const uint32_t M,
                                   int32_t* dst, const uint32_t strd,
                                   const int32_t scaler_bias,
                                   cudaStream_t stream) {
-    constexpr size_t TX = 32, TY = 32;
-    constexpr size_t BX = 32, BY = 32;
+    constexpr uint32_t TX = 32, TY = 32, BX = 32, BY = 32;
     dim3 nthreads{TX, TY};
-    dim3 nblocks{static_cast<uint32_t>(DIVUP(N, BX)),
-                 static_cast<uint32_t>(DIVUP(M, BY))};
-    span_qsum<TX, TY, BX, BY><<<nblocks, nthreads, 0, stream>>>(qSumA, M, qSumB, N, dst, strd,
-                                                                scaler_bias);
+    dim3 nblocks{DIVUP(N, BX), DIVUP(M, BY)};
+    span_qsum<TX, TY, BX, BY><<<nblocks, nthreads, 0, stream>>>(
+            qSumA, M, qSumB, N, dst, strd, scaler_bias);
     after_kernel_launch();
 }
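The casts can go away because dim3's fields are `unsigned int`, and list-initializing them from a runtime size_t expression is a narrowing conversion that clang-11 rejects; once TX/TY/BX/BY are uint32_t, DIVUP(N, BX) stays in 32-bit unsigned arithmetic. A sketch of the distinction (DIVUP is spelled out here and the function name is hypothetical; build with the CUDA headers available):

    #include <cstdint>
    #include <cuda_runtime.h>  // for dim3

    #define DIVUP(x, y) (((x) + (y)-1) / (y))

    void launch_config(uint32_t M, uint32_t N) {
        constexpr uint32_t BX = 32, BY = 32;
        // DIVUP(N, BX) is uint32_t, so the braced init of dim3's
        // unsigned int members involves no narrowing:
        dim3 nblocks{DIVUP(N, BX), DIVUP(M, BY)};
        // With the old `constexpr size_t BX`, DIVUP(N, BX) would be a
        // runtime size_t and the line above would be ill-formed under
        // clang-11 -- exactly what the removed static_cast<uint32_t>s
        // were papering over.
        (void)nblocks;
    }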
@@ -46,10 +46,7 @@ using namespace wmma::experimental::precision;
 namespace wmma_matrix_mul_u4 {
-constexpr size_t WMMA_M = 8;
-constexpr size_t WMMA_N = 8;
-constexpr size_t WMMA_K = 32;
-constexpr size_t WARP_SIZE = 32;
+constexpr uint32_t WMMA_M = 8, WMMA_N = 8, WMMA_K = 32, WARP_SIZE = 32;
 
 template <size_t WARP_X_, size_t WARP_Y_, size_t ROW_PER_WARP_,
           size_t COL_PER_WARP_>
@@ -334,10 +331,8 @@ __global__ void u4_gemm_template_device_nt(const uint8_t* /*A*/,
 void _do_dispatch_wmma_matrix_mul_u4(const uint8_t* A, const uint8_t* B,
                                      int32_t* C, int M, int N, int K, int lda,
                                      int ldb, int ldc, cudaStream_t stream) {
-    constexpr size_t warp_x = 4;
-    constexpr size_t warp_y = 4;
-    constexpr size_t row_per_warp = 4;
-    constexpr size_t col_per_warp = 4;
+    constexpr uint32_t warp_x = 4, warp_y = 4, row_per_warp = 4,
+                       col_per_warp = 4;
     typedef BlockConfig<warp_x, warp_y, row_per_warp, col_per_warp>
             BlockConfig_;
     dim3 block{warp_x * WARP_SIZE, warp_y};
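These two wmma hunks move in the same direction: the constants feed dim3 construction and kernel launch bounds, so keeping them in uint32_t avoids 64-to-32-bit conversions there, and the comma-fold is purely cosmetic. The uint32_t constants still work as arguments to BlockConfig's `size_t` non-type template parameters, since that widening conversion is exact. A small check of that claim (BlockConfigLike is a hypothetical stand-in):

    #include <cstddef>
    #include <cstdint>

    template <size_t WARP_X_, size_t WARP_Y_>
    struct BlockConfigLike {
        static constexpr size_t WARP_X = WARP_X_;
        static constexpr size_t WARP_Y = WARP_Y_;
    };

    constexpr uint32_t warp_x = 4, warp_y = 4;
    // uint32_t constants convert losslessly to the size_t template
    // parameters, so narrowing the declarations changes nothing here:
    using Cfg = BlockConfigLike<warp_x, warp_y>;
    static_assert(Cfg::WARP_X == 4 && Cfg::WARP_Y == 4, "unchanged");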
@@ -110,8 +110,10 @@ void do_cvt_normal_s32(_megdnn_tensor_in src, _megdnn_tensor_out dst) {
     float scale = dst.layout.dtype.param<dtype::QuantizedS32>().scale;
     float dscale = 1.f / scale;
     for (size_t i = 0; i < n; ++i) {
-        dptr[i] = saturate<int32_t, float>(std::round(sptr[i] * dscale),
-                                           -2147483648, 2147483647);
+        dptr[i] = saturate<int32_t, float>(
+                std::round(sptr[i] * dscale),
+                static_cast<float>(std::numeric_limits<int32_t>::min()),
+                static_cast<float>(std::numeric_limits<int32_t>::max()));
     }
 }
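The old literals were doubly awkward: `-2147483648` is unary minus applied to `2147483648`, which does not fit in `int`, and the implicit int-to-float conversion of `2147483647` changes its value to 2147483648.0f, which clang-11 diagnoses via -Wimplicit-const-int-float-conversion. The float rounding itself is unavoidable; the explicit casts just make it intentional and warning-free. The same rewrite recurs in the three do_cvt_* hunks below and in QConverter. A reduced sketch, assuming saturate clamps its first argument to [lo, hi] before converting:

    #include <cmath>
    #include <cstdint>
    #include <limits>

    // Stand-in for megdnn's saturate helper (assumed semantics: clamp in
    // ctype, then convert to dtype).
    template <typename dtype, typename ctype>
    dtype saturate(ctype val, ctype lo, ctype hi) {
        return static_cast<dtype>(val < lo ? lo : (val > hi ? hi : val));
    }

    int32_t cvt(float x, float scale) {
        // saturate<int32_t, float>(v, -2147483648, 2147483647) would make
        // clang-11 warn: 2147483647 is not exactly representable as float.
        return saturate<int32_t, float>(
                std::round(x * scale),
                static_cast<float>(std::numeric_limits<int32_t>::min()),
                static_cast<float>(std::numeric_limits<int32_t>::max()));
    }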
@@ -219,8 +221,10 @@ void do_cvt_s8_s32(_megdnn_tensor_in src, _megdnn_tensor_out dst) {
     float dst_scale = dst.layout.dtype.param<dtype::QuantizedS32>().scale;
     float scale = src_scale / dst_scale;
     for (size_t i = 0; i < n; ++i) {
-        dptr[i] = saturate<int32_t, float>(std::round(sptr[i] * scale),
-                                           -2147483648, 2147483647);
+        dptr[i] = saturate<int32_t, float>(
+                std::round(sptr[i] * scale),
+                static_cast<float>(std::numeric_limits<int32_t>::min()),
+                static_cast<float>(std::numeric_limits<int32_t>::max()));
     }
 }
@@ -232,8 +236,10 @@ void do_cvt_s32_s32(_megdnn_tensor_in src, _megdnn_tensor_out dst) {
     float dst_scale = dst.layout.dtype.param<dtype::QuantizedS32>().scale;
     float scale = src_scale / dst_scale;
     for (size_t i = 0; i < n; ++i) {
-        dptr[i] = saturate<int32_t, float>(std::round(sptr[i] * scale),
-                                           -2147483648, 2147483647);
+        dptr[i] = saturate<int32_t, float>(
+                std::round(sptr[i] * scale),
+                static_cast<float>(std::numeric_limits<int32_t>::min()),
+                static_cast<float>(std::numeric_limits<int32_t>::max()));
     }
 }
@@ -247,9 +253,10 @@ void do_cvt_asymm8_s32(_megdnn_tensor_in src, _megdnn_tensor_out dst) {
     float dst_scale = dst.layout.dtype.param<dtype::QuantizedS32>().scale;
     float scale = src_scale / dst_scale;
     for (size_t i = 0; i < n; ++i) {
-        dptr[i] =
-                saturate<int32_t, float>(std::round((sptr[i] - src_zp) * scale),
-                                         -2147483648, 2147483647);
+        dptr[i] = saturate<int32_t, float>(
+                std::round((sptr[i] - src_zp) * scale),
+                static_cast<float>(std::numeric_limits<int32_t>::min()),
+                static_cast<float>(std::numeric_limits<int32_t>::max()));
     }
 }
@@ -66,8 +66,10 @@ inline dt_quint8 QConverter::convert(const float& src, const uint8_t& zp) {
 template <>
 inline dt_qint32 QConverter::convert(const float& src) {
-    return dt_qint32(
-            saturate<int32_t, float>(std::round(src), -2147483648, 2147483647));
+    return dt_qint32(saturate<int32_t, float>(
+            std::round(src),
+            static_cast<float>(std::numeric_limits<int32_t>::min()),
+            static_cast<float>(std::numeric_limits<int32_t>::max())));
 }
 
 template <>
@@ -101,7 +101,8 @@ public:
     ComputingSequence(const std::shared_ptr<ComputingGraph>& graph)
             : m_owner_graph_refkeep{graph},
               m_owner_graph{ComputingGraphImpl::downcast(graph.get())},
-              m_have_parent_graph{m_owner_graph->m_parent_graph} {}
+              m_have_parent_graph{
+                      static_cast<bool>(m_owner_graph->m_parent_graph)} {}
 
     GraphExecutable::ExecEnv& exec_env() { return m_exec_env; }
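This one is presumably clang-11 implementing P1957R2, which makes pointer-to-bool a narrowing conversion in list-initialization: m_have_parent_graph is a bool, m_parent_graph a pointer (or pointer-like) member, so the braced member init becomes ill-formed without the cast. A hypothetical reduction:

    struct Graph;  // stand-in for the owner-graph type

    struct Seq {
        bool m_have_parent_graph;
        // clang-11 rejects `m_have_parent_graph{parent}`: narrowing from
        // 'Graph *' to 'bool' in a braced initializer (P1957R2).
        Seq(Graph* parent)
                : m_have_parent_graph{static_cast<bool>(parent)} {}
    };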
@@ -371,9 +371,11 @@ serialization::IndexDescMaskDump::from_index_desc(const IndexDesc &desc) {
     ret.nr_item = desc.size();
     for (size_t i = 0; i < desc.size(); ++ i) {
         auto &&s = desc[i];
-        ret.items[i] = {
-                static_cast<int8_t>(s.axis.get_raw()),
-                s.begin.node(), s.end.node(), s.step.node(), s.idx.node()};
+        ret.items[i] = {static_cast<int8_t>(s.axis.get_raw()),
+                        static_cast<bool>(s.begin.node()),
+                        static_cast<bool>(s.end.node()),
+                        static_cast<bool>(s.step.node()),
+                        static_cast<bool>(s.idx.node())};
     }
     return ret;
 }
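Same P1957R2 narrowing fix: the dumped items apparently store presence flags rather than the nodes themselves, so each node() pointer must be converted to bool explicitly inside the braced initializer. A sketch of the presumed layout (struct and field names are hypothetical):

    #include <cstdint>

    struct Node;

    // The serialized mask item records only which slice components exist.
    struct MaskItem {
        int8_t axis;
        bool begin, end, step, idx;
    };

    MaskItem make_item(int8_t axis, Node* begin, Node* end, Node* step,
                       Node* idx) {
        // Initializing the bool fields straight from pointers would be a
        // narrowing conversion under clang-11:
        return {axis, static_cast<bool>(begin), static_cast<bool>(end),
                static_cast<bool>(step), static_cast<bool>(idx)};
    }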