diff --git a/src/jit/impl/mlir/compiler.cpp b/src/jit/impl/mlir/compiler.cpp
index 12271528..5f85fc6d 100644
--- a/src/jit/impl/mlir/compiler.cpp
+++ b/src/jit/impl/mlir/compiler.cpp
@@ -26,6 +26,7 @@
 #include
 #include
+#include
 #include
 #include
 #include
@@ -152,6 +153,7 @@ void add_cuda_lowering_pass(mlir::PassManager& manager,
 {
     mlir::OpPassManager& opt_pm = manager.nest<mlir::FuncOp>();
     opt_pm.addPass(create_lower_to_gpu_pass());
+    opt_pm.addPass(mlir::createLowerToCFGPass());
     opt_pm.addPass(mlir::createCanonicalizerPass());
     opt_pm.addPass(mlir::createCSEPass());
     opt_pm.addPass(mlir::createLoopFusionPass());
diff --git a/src/jit/impl/mlir/executable_cuda.cpp b/src/jit/impl/mlir/executable_cuda.cpp
index 4f7c8629..cea26a67 100644
--- a/src/jit/impl/mlir/executable_cuda.cpp
+++ b/src/jit/impl/mlir/executable_cuda.cpp
@@ -32,6 +32,14 @@
 using namespace mgb;
 using namespace jit;
 namespace {
+
+int64_t get_grid_size(int64_t nr_elements, int64_t block_size) {
+    // the kernel loop is unrolled 3x: each thread covers at most three elements
+    int64_t a = nr_elements / (block_size * 2);
+    int64_t b = (nr_elements - 1) / (block_size * 3) + 1;
+    return std::max(a, b);
+}
+
 template
 void setup_and_launch(const JITExecutor* fusion_opr, CUfunction func,
                       int block_size) {
@@ -87,9 +95,18 @@ void setup_and_launch(const JITExecutor* fusion_opr, CUfunction func,
     const CompNodeEnv& env =
             CompNodeEnv::from_comp_node(fusion_opr->comp_node());

-    int64_t num_block = (nr_elements - 1) / block_size + 1;
+    int64_t grid_size;
+    if (nr_elements <= block_size) {
+        block_size = nr_elements;
+        grid_size = 1;
+    } else {
+        grid_size = get_grid_size(nr_elements, block_size);
+    }
+    int64_t nr_threads = grid_size * block_size;
     params.push_back(&nr_elements);
-    MGB_CUDA_CU_CHECK(cuLaunchKernel(func, num_block, 1, 1, block_size, 1, 1, 0,
+    params.push_back(&nr_threads);
+
+    MGB_CUDA_CU_CHECK(cuLaunchKernel(func, grid_size, 1, 1, block_size, 1, 1, 0,
                                      env.cuda_env().stream, params.data(), 0));
 }
diff --git a/src/jit/impl/mlir/ir/lower_to_gpu_pass.cpp b/src/jit/impl/mlir/ir/lower_to_gpu_pass.cpp
index 3ab9166e..88d16e4c 100644
--- a/src/jit/impl/mlir/ir/lower_to_gpu_pass.cpp
+++ b/src/jit/impl/mlir/ir/lower_to_gpu_pass.cpp
@@ -21,11 +21,6 @@
 #include "megbrain/jit/mlir/ir/passes.h"
 #include "megbrain/jit/mlir/ir/utils.h"

-#include
-#include
-#include
-#include
-#include
 #include
 #include
 #include
@@ -39,124 +34,98 @@
 using namespace mgb;
 using namespace jit;

 namespace {

-mlir::Value get_tid(ConversionPatternRewriter& rewriter, const Location& loc) {
-    auto thread_idx = rewriter.create<gpu::ThreadIdOp>(
-            loc, rewriter.getIndexType(), rewriter.getStringAttr("x"));
-    auto block_idx = rewriter.create<gpu::BlockIdOp>(
-            loc, rewriter.getIndexType(), rewriter.getStringAttr("x"));
-    auto group_size = rewriter.create<gpu::BlockDimOp>(
-            loc, rewriter.getIndexType(), rewriter.getStringAttr("x"));
-    Value index = rewriter.create<AddIOp>(
-            loc, thread_idx,
-            rewriter.create<MulIOp>(loc, block_idx, group_size));
-
-    return index;
-}
+using Rewriter = ConversionPatternRewriter;
+using Layout = megdnn::TensorLayout;

-megdnn::TensorLayout output_layout(gpu::LaunchOp& launch_op) {
-    auto func_op = launch_op.getParentOfType<mlir::FuncOp>();
-    mgb_assert(func_op, "Unexpexted launch op.");
-    for (auto block_iter = func_op.rbegin(); block_iter != func_op.rend();
-         block_iter++) {
-        for (auto op_iter = block_iter->rbegin(); op_iter != block_iter->rend();
-             op_iter++) {
-            auto op = llvm::dyn_cast_or_null<dialect::AssignOp>(&(*op_iter));
-            if (op && op.getNumOperands() > 0) {
-                return mlir_type_to_layout(*(op.operand_type_begin()));
-            }
-        }
+/* ===================== GpuLoweringHelper ===================== */
+
+struct GpuLoweringHelper {
+    GpuLoweringHelper(scf::ForOp* for_op, Value index, const Layout& dest)
+            : m_for_op(for_op), m_index(index), m_dest(dest) {}
+
+    void set_insertion_point(OpBuilder& builder) const {
+        // insert before the last operation (scf.yield) in the loop body
+        builder.setInsertionPoint(&(m_for_op->getLoopBody().front().back()));
     }
-    mgb_throw(MegBrainError, "Unexpexted launch op.");
-}

-std::vector<mlir::Value> get_multidim_tid(ConversionPatternRewriter& rewriter,
-                                          const Location& loc,
-                                          const mlir::Value& val,
-                                          const megdnn::TensorLayout& dst) {
-    Value index = get_tid(rewriter, loc);
-
-    auto type = val.getType().dyn_cast_or_null<mlir::MemRefType>();
-    if (type) {
-        ValueBuilderHelper helper(rewriter, loc);
-        std::vector<mlir::Value> idxs;
-        idxs.resize(dst.ndim);
-        mlir::Value dim_index = index;
-        for (int i = dst.ndim - 1; i >= 0; i--) {
-            auto cur_index = helper.modI(dim_index, helper.const_i32(dst[i]));
-            idxs[i] = cur_index;
-            dim_index = helper.divI(dim_index, helper.const_i32(dst[i]));
+    std::vector<Value> map_indices(OpBuilder& builder, Location loc,
+                                   Value value) const {
+        auto type = value.getType().dyn_cast_or_null<mlir::MemRefType>();
+        if (!type) {
+            return {m_index};
         }
-        megdnn::TensorLayout src_layout = mlir_type_to_layout(type);
+
+        std::vector<Value> indices(m_dest.ndim);
+        ValueBuilderHelper helper(builder, loc);
+
+        // map global index to multi-dimensional indices
+        Value dim_index = m_index;
+        for (int i = m_dest.ndim - 1; i >= 0; i--) {
+            indices[i] = helper.modI(dim_index, helper.const_i32(m_dest[i]));
+            dim_index = helper.divI(dim_index, helper.const_i32(m_dest[i]));
+        }
+
+        // allow broadcasting
+        Layout src_layout = mlir_type_to_layout(type);
         src_layout.init_contiguous_stride();
         for (int i = 0; i < type.getRank(); ++i) {
             if (src_layout[i] == 1) {
-                idxs[i] = helper.const_i32(0);
+                indices[i] = helper.const_i32(0);
             }
         }
-        return idxs;
-    } else {
-        return {index};
+        return indices;
     }
-}

-struct ElemwiseLowering : public ConversionPattern {
-    ElemwiseLowering(MLIRContext* ctx, gpu::LaunchOp& launch_op)
-            : ConversionPattern(dialect::Elemwise::getOperationName(), 1, ctx),
-              m_launch_op{launch_op} {}
+private:
+    scf::ForOp* m_for_op;
+    Value m_index;
+    Layout m_dest;
+};

-    LogicalResult matchAndRewrite(
-            Operation* op, ArrayRef<Value> operands,
-            ConversionPatternRewriter& rewriter) const final {
-        auto loc = op->getLoc();
+/* ===================== conversion patterns ===================== */

-        rewriter.setInsertionPointToEnd(&(m_launch_op.body().front()));
+struct AssignOpLowering : public ConversionPattern, public GpuLoweringHelper {
+    AssignOpLowering(MLIRContext* ctx, scf::ForOp* for_op, mlir::Value index,
+                     const Layout& dest)
+            : ConversionPattern(dialect::AssignOp::getOperationName(), 2, ctx),
+              GpuLoweringHelper(for_op, index, dest) {}

-        auto dst_layout = output_layout(m_launch_op);
-        auto inputs = llvm::to_vector<4>(
-                llvm::map_range(operands, [&](mlir::Value val) {
-                    auto index =
-                            get_multidim_tid(rewriter, loc, val, dst_layout);
-                    return get_operand(rewriter, loc, val, index);
-                }));
+    LogicalResult matchAndRewrite(Operation* op, ArrayRef<Value> operands,
+                                  Rewriter& rewriter) const final {
+        auto loc = op->getLoc();
+        set_insertion_point(rewriter);

-        rewriter.replaceOp(op,
-                           lower_elemwise_to_std(op, rewriter, loc, inputs));
+        auto index = map_indices(rewriter, loc, operands[1]);
+        auto input = get_operand(rewriter, loc, operands[0], index);
+        rewriter.create<StoreOp>(loc, input, operands[1], index);
+
+        rewriter.eraseOp(op);
         return success();
     }
-
-private:
-    gpu::LaunchOp& m_launch_op;
 };

-struct TypeCvtLowering : public ConversionPattern {
-    TypeCvtLowering(MLIRContext* ctx, gpu::LaunchOp& launch_op)
-            : ConversionPattern(dialect::TypeCvt::getOperationName(), 1, ctx),
-              m_launch_op{launch_op} {}
-    LogicalResult matchAndRewrite(
-            Operation* op, ArrayRef<Value> operands,
-            ConversionPatternRewriter& rewriter) const final {
-        auto loc = op->getLoc();
-
-        rewriter.setInsertionPointToEnd(&(m_launch_op.body().front()));
-
-        auto dst_layout = output_layout(m_launch_op);
-        auto index = get_multidim_tid(rewriter, loc, operands[0], dst_layout);
-        auto input = get_operand(rewriter, loc, operands[0], index);
+struct ConstantScalarOpLowering
+        : public OpRewritePattern<dialect::ConstantScalarOp>,
+          public GpuLoweringHelper {
+    ConstantScalarOpLowering(MLIRContext* ctx, scf::ForOp* for_op, Value index,
+                             const Layout& dest)
+            : OpRewritePattern<dialect::ConstantScalarOp>(ctx),
+              GpuLoweringHelper(for_op, index, dest) {}

-        rewriter.replaceOp(op, lower_typecvt_to_std(op, rewriter, loc, input));
+    LogicalResult matchAndRewrite(dialect::ConstantScalarOp op,
+                                  PatternRewriter& rewriter) const final {
+        set_insertion_point(rewriter);
+        rewriter.replaceOpWithNewOp<mlir::ConstantOp>(op, op.value());
         return success();
     }
-
-private:
-    gpu::LaunchOp& m_launch_op;
 };

-struct DimshuffleLowering : public ConversionPattern {
-    DimshuffleLowering(MLIRContext* ctx, gpu::LaunchOp& launch_op)
+struct DimshuffleLowering : public ConversionPattern, public GpuLoweringHelper {
+    DimshuffleLowering(MLIRContext* ctx, scf::ForOp* for_op, Value index,
+                       const Layout& dest)
             : ConversionPattern(dialect::Dimshuffle::getOperationName(), 1, ctx),
-              m_launch_op{launch_op} {}
+              GpuLoweringHelper(for_op, index, dest) {}

     static std::vector<mlir::Value> get_index_from_pattern(
             const std::vector<int32_t>& pattern,
@@ -172,163 +141,162 @@ struct DimshuffleLowering : public ConversionPattern {
         return res;
     }

-    LogicalResult matchAndRewrite(
-            Operation* op, ArrayRef<Value> operands,
-            ConversionPatternRewriter& rewriter) const final {
+    LogicalResult matchAndRewrite(Operation* op, ArrayRef<Value> operands,
+                                  Rewriter& rewriter) const final {
         auto loc = op->getLoc();
+        set_insertion_point(rewriter);

-        rewriter.setInsertionPointToEnd(&(m_launch_op.body().front()));
-
-        auto dst_layout = output_layout(m_launch_op);
-        auto index = get_multidim_tid(rewriter, loc, operands[0], dst_layout);
         auto pattern = llvm::dyn_cast<dialect::Dimshuffle>(op).pattern();
+        auto index = map_indices(rewriter, loc, operands[0]);
         auto shuffled_index = get_index_from_pattern(pattern, index);

         rewriter.replaceOp(op,
                            get_operand(rewriter, loc, operands[0], shuffled_index));
         return success();
     }
-
-private:
-    gpu::LaunchOp& m_launch_op;
 };

-struct ReturnOpLowering : public ConversionPattern {
-    ReturnOpLowering(MLIRContext* ctx, gpu::LaunchOp& launch_op)
-            : ConversionPattern(dialect::ReturnOp::getOperationName(), 1, ctx),
-              m_launch_op{launch_op} {}
+struct ElemwiseLowering : public ConversionPattern, public GpuLoweringHelper {
+    ElemwiseLowering(MLIRContext* ctx, scf::ForOp* for_op, Value index,
+                     const Layout& dest)
+            : ConversionPattern(dialect::Elemwise::getOperationName(), 1, ctx),
+              GpuLoweringHelper(for_op, index, dest) {}

-    LogicalResult matchAndRewrite(
-            Operation* op, ArrayRef<Value>,
-            ConversionPatternRewriter& rewriter) const final {
-        rewriter.replaceOpWithNewOp<gpu::TerminatorOp>(op);
+    LogicalResult matchAndRewrite(Operation* op, ArrayRef<Value> operands,
+                                  Rewriter& rewriter) const final {
         auto loc = op->getLoc();
+        set_insertion_point(rewriter);

-        //! remove the first gpu.terminator
-        m_launch_op.body().front().front().erase();
-
-        //! if (tid >= nr_tid) {return;} in the begin of the block
-        rewriter.setInsertionPointToStart(&(m_launch_op.body().front()));
-        Block* cond_block = rewriter.getInsertionBlock();
-        Block::iterator op_position = rewriter.getInsertionPoint();
-        Block* remaining_ops_block =
-                rewriter.splitBlock(cond_block, op_position);
-        rewriter.setInsertionPointToEnd(cond_block);
-
-        auto index = get_tid(rewriter, loc);
-        auto comparison = rewriter.create<CmpIOp>(
-                loc, CmpIPredicate::sge, index,
-                m_launch_op.getParentOfType<mlir::FuncOp>()
-                        .getArguments()
-                        .back());
-
-        Block* then_block =
-                rewriter.splitBlock(cond_block, rewriter.getInsertionPoint());
-        rewriter.setInsertionPointToEnd(then_block);
-        rewriter.create<gpu::TerminatorOp>(loc);
-
-        rewriter.setInsertionPointToEnd(cond_block);
-        rewriter.create<CondBranchOp>(
-                loc, comparison, then_block, ArrayRef<Value>(),
-                remaining_ops_block, ArrayRef<Value>());
-
-        rewriter.setInsertionPointToEnd(remaining_ops_block);
-        rewriter.create<gpu::TerminatorOp>(loc);
+        // currently Elemwise handles at most three operands
+        auto inputs = llvm::to_vector<4>(
+                llvm::map_range(operands, [&](mlir::Value val) {
+                    auto index = map_indices(rewriter, loc, val);
+                    return get_operand(rewriter, loc, val, index);
+                }));
+        rewriter.replaceOp(op,
+                           lower_elemwise_to_std(op, rewriter, loc, inputs));
         return success();
     }
-
-private:
-    gpu::LaunchOp& m_launch_op;
 };

-struct ConstantScalarOpLowering
-        : public OpRewritePattern<dialect::ConstantScalarOp> {
-    ConstantScalarOpLowering(MLIRContext* ctx, gpu::LaunchOp& launch_op)
-            : OpRewritePattern<dialect::ConstantScalarOp>(ctx),
-              m_launch_op{launch_op} {}
-
-    LogicalResult matchAndRewrite(dialect::ConstantScalarOp op,
-                                  PatternRewriter& rewriter) const final {
-        dialect::ConstantScalarOpAdaptor constant_scalar_adaptor(op);
-        rewriter.setInsertionPointToEnd(&(m_launch_op.body().front()));
+struct ReturnOpLowering : public ConversionPattern {
+    ReturnOpLowering(MLIRContext* ctx, scf::ForOp*, Value, const Layout&)
+            : ConversionPattern(dialect::ReturnOp::getOperationName(), 1, ctx) {
+    }

-        rewriter.replaceOpWithNewOp<mlir::ConstantOp>(
-                op, constant_scalar_adaptor.value());
+    LogicalResult matchAndRewrite(Operation* op, ArrayRef<Value>,
+                                  Rewriter& rewriter) const final {
+        rewriter.replaceOpWithNewOp<gpu::TerminatorOp>(op);
         return success();
     }
-
-private:
-    gpu::LaunchOp& m_launch_op;
 };

-struct AssignOpLowering : public ConversionPattern {
-    AssignOpLowering(MLIRContext* ctx, gpu::LaunchOp& launch_op)
-            : ConversionPattern(dialect::AssignOp::getOperationName(), 2, ctx),
-              m_launch_op{launch_op} {}
+struct TypeCvtLowering : public ConversionPattern, public GpuLoweringHelper {
+    TypeCvtLowering(MLIRContext* ctx, scf::ForOp* for_op, Value index,
+                    const Layout& dest)
+            : ConversionPattern(dialect::TypeCvt::getOperationName(), 1, ctx),
+              GpuLoweringHelper(for_op, index, dest) {}

-    LogicalResult matchAndRewrite(
-            Operation* op, ArrayRef<Value> operands,
-            ConversionPatternRewriter& rewriter) const final {
+    LogicalResult matchAndRewrite(Operation* op, ArrayRef<Value> operands,
+                                  Rewriter& rewriter) const final {
         auto loc = op->getLoc();
+        set_insertion_point(rewriter);

-        dialect::AssignOpAdaptor assign_adaptor(operands);
-        rewriter.setInsertionPointToEnd(&(m_launch_op.body().front()));
-
-        auto dst_layout = output_layout(m_launch_op);
-        auto index = get_multidim_tid(rewriter, loc, assign_adaptor.rhs(),
-                                      dst_layout);
-
-        auto loaded_lhs =
-                get_operand(rewriter, loc, assign_adaptor.lhs(), index);
-        rewriter.create<StoreOp>(loc, loaded_lhs, assign_adaptor.rhs(), index);
+        auto index = map_indices(rewriter, loc, operands[0]);
+        auto input = get_operand(rewriter, loc, operands[0], index);

-        rewriter.eraseOp(op);
+        rewriter.replaceOp(op, lower_typecvt_to_std(op, rewriter, loc, input));
         return success();
     }
-
-private:
-    gpu::LaunchOp& m_launch_op;
 };

+/* ===================== MgbToGpuLoweringPass ===================== */
+
 class MgbToGpuLoweringPass
         : public PassWrapper<MgbToGpuLoweringPass, FunctionPass> {
 public:
-    void getDependentDialects(mlir::DialectRegistry& registry) const override {
-        registry.insert<gpu::GPUDialect>();
-        registry.insert<StandardOpsDialect>();
-    }
+    void getDependentDialects(DialectRegistry& registry) const override;
+    void runOnFunction() final;

-    void runOnFunction() override final {
-        auto func_op = getFunction();
-        Location loc = func_op.getLoc();
-        OpBuilder builder(&func_op.getBody());
-        Value constantOne = builder.create<ConstantIndexOp>(loc, 1);
-        gpu::LaunchOp launch_op = builder.create<gpu::LaunchOp>(
-                loc, constantOne, constantOne, constantOne, constantOne,
-                constantOne, constantOne);
-        builder.setInsertionPointToEnd(&(launch_op.body().front()));
-        builder.create<gpu::TerminatorOp>(loc);
-
-        OwningRewritePatternList patterns;
-        ConversionTarget target(getContext());
-        target.addLegalDialect<gpu::GPUDialect>();
-        target.addLegalDialect<StandardOpsDialect>();
-        target.addIllegalDialect<MgbDialect>();
-
-        patterns.insert<ElemwiseLowering, TypeCvtLowering, DimshuffleLowering,
-                        ReturnOpLowering, ConstantScalarOpLowering,
-                        AssignOpLowering>(&getContext(), launch_op);
-
-        if (failed(applyPartialConversion(func_op, target,
-                                          std::move(patterns)))) {
-            signalPassFailure();
-        }
-    }
+private:
+    Value get_idx(OpBuilder& builder, Location loc);
+    Layout get_dest_layout(FuncOp func_op);
 };

+void MgbToGpuLoweringPass::getDependentDialects(
+        DialectRegistry& registry) const {
+    registry.insert<gpu::GPUDialect, scf::SCFDialect, StandardOpsDialect>();
+}
+
+void MgbToGpuLoweringPass::runOnFunction() {
+    FuncOp func_op = getFunction();
+    Location loc = func_op.getLoc();
+    OpBuilder builder(func_op.getBody());
+
+    // create gpu::LaunchOp
+    Value one = builder.create<ConstantIndexOp>(loc, 1);
+    gpu::LaunchOp launch_op =
+            builder.create<gpu::LaunchOp>(loc, one, one, one, one, one, one);
+    builder.setInsertionPointToEnd(&(launch_op.body().front()));
+
+    // create scf::ForOp
+    auto it = func_op.getArguments().end();
+    Value nr_threads = *(--it);
+    Value nr_elements = *(--it);
+    Value idx = get_idx(builder, loc);
+    auto for_op = builder.create<scf::ForOp>(loc, idx, nr_elements, nr_threads);
+
+    builder.create<gpu::TerminatorOp>(loc);
+
+    Layout dest = get_dest_layout(func_op);
+    Value for_idx = for_op.getLoopBody().getArgument(0);
+
+    OwningRewritePatternList patterns;
+    patterns.insert<AssignOpLowering, ConstantScalarOpLowering,
+                    DimshuffleLowering, ElemwiseLowering, ReturnOpLowering,
+                    TypeCvtLowering>(&getContext(), &for_op, for_idx, dest);
+
+    ConversionTarget target(getContext());
+    target.addLegalDialect<gpu::GPUDialect, scf::SCFDialect,
+                           StandardOpsDialect>();
+    target.addIllegalDialect<MgbDialect>();
+
+    if (failed(applyPartialConversion(func_op, target, std::move(patterns)))) {
+        signalPassFailure();
+    }
+}
+
+//! block_dim * block_idx + thread_idx
+Value MgbToGpuLoweringPass::get_idx(OpBuilder& builder, Location loc) {
+    IndexType idx_type = builder.getIndexType();
+    StringAttr x = builder.getStringAttr("x");
+
+    Value block_dim = builder.create<gpu::BlockDimOp>(loc, idx_type, x);
+    Value block_idx = builder.create<gpu::BlockIdOp>(loc, idx_type, x);
+    Value thread_idx = builder.create<gpu::ThreadIdOp>(loc, idx_type, x);
+
+    Value prod = builder.create<MulIOp>(loc, block_dim, block_idx);
+    return builder.create<AddIOp>(loc, prod, thread_idx);
+}
+
+//! traverse the body of func_op and get dest_layout from AssignOp
+Layout MgbToGpuLoweringPass::get_dest_layout(FuncOp func_op) {
+    Layout dest_layout;
+    bool found = false;
+    func_op.walk([&](dialect::AssignOp assign_op) {
+        dest_layout = mlir_type_to_layout(assign_op.lhs().getType());
+        found = true;
+        return WalkResult::interrupt();
+    });
+    mgb_assert(found, "AssignOp not found in the body of FuncOp");
+    return dest_layout;
+}
+
 } // namespace

+/* ===================== create_lower_to_gpu_pass ===================== */
+
 std::unique_ptr<mlir::Pass> mgb::jit::create_lower_to_gpu_pass() {
     return std::make_unique<MgbToGpuLoweringPass>();
 }
diff --git a/src/jit/impl/mlir/mlir_gen.cpp b/src/jit/impl/mlir/mlir_gen.cpp
index 93b1048b..58205a65 100644
--- a/src/jit/impl/mlir/mlir_gen.cpp
+++ b/src/jit/impl/mlir/mlir_gen.cpp
@@ -80,7 +80,9 @@ private:
         for (auto&& arg : args.outputs) {
             func_args.push_back(get_type(arg.from->layout()));
         }
-        //! the last arg is nr_elements
+        //! nr_elements
+        func_args.push_back(m_builder.getIndexType());
+        //! nr_threads
        func_args.push_back(m_builder.getIndexType());

         auto func_type = m_builder.getFunctionType(func_args, llvm::None);
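
Note on the launch configuration: because the kernel's grid-stride loop is unrolled three times, get_grid_size picks the larger of a "two elements per thread" grid and a "three elements per thread" (ceiling) grid; the ceiling term guarantees 3 * nr_threads >= nr_elements, so every element is covered. A minimal standalone C++ sketch of the host-side math (the main driver and the block size of 256 are illustrative assumptions, not part of the patch):

    #include <algorithm>
    #include <cassert>
    #include <cstdint>
    #include <cstdio>
    #include <initializer_list>

    // same computation as get_grid_size in executable_cuda.cpp
    int64_t get_grid_size(int64_t nr_elements, int64_t block_size) {
        int64_t a = nr_elements / (block_size * 2);           // ~2 elements per thread
        int64_t b = (nr_elements - 1) / (block_size * 3) + 1; // ceil(n / 3b): ~3 per thread
        return std::max(a, b);
    }

    int main() {
        for (int64_t n : {1, 100, 1024, 1000000}) {
            int64_t block = 256;  // stand-in for the occupancy-derived block size
            int64_t grid;
            if (n <= block) {  // same special case as setup_and_launch
                block = n;
                grid = 1;
            } else {
                grid = get_grid_size(n, block);
            }
            int64_t nr_threads = grid * block;
            assert(3 * nr_threads >= n);  // the 3x-unrolled loop reaches every element
            std::printf("n=%lld grid=%lld nr_threads=%lld\n", (long long)n,
                        (long long)grid, (long long)nr_threads);
        }
        return 0;
    }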
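
On the kernel side, the lowering pass now wraps the fused body in an scf.for that starts at the global thread index and strides by nr_threads (which is why mlir_gen.cpp appends nr_threads as an extra function argument), replacing the old per-thread bounds check. After LowerToCFG and the GPU backend run, the generated kernel behaves roughly like the CUDA grid-stride loop below; fused_kernel and its two-input add body are illustrative stand-ins for the fused elemwise/typecvt/dimshuffle code, not the actual generated IR:

    #include <cstdint>

    __global__ void fused_kernel(const float* a, const float* b, float* dst,
                                 int64_t nr_elements, int64_t nr_threads) {
        // get_idx(): block_dim * block_idx + thread_idx
        int64_t idx = (int64_t)blockDim.x * blockIdx.x + threadIdx.x;
        // scf.for %i = %idx to %nr_elements step %nr_threads
        for (int64_t i = idx; i < nr_elements; i += nr_threads) {
            // map_indices() would div/mod i against the destination layout here;
            // for contiguous 1-D buffers the flat index is used directly
            dst[i] = a[i] + b[i];
        }
    }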