#pragma once

#include "src/cuda/utils.cuh"

#include <stdint.h>
#include <stdio.h>

namespace megdnn {
namespace cuda {
namespace cumprod {

//! compute conventional product of elements
template <typename T>
struct ProdOp {
    const T* data;
    typedef ProdOp ContigOp;

    ProdOp(const T* d) : data(d) {}

    //! identity element of the product reduction
    __host__ __device__ static T init() { return T(1); }

    //! binary reduction operator: multiply
    __device__ static T apply(T lhs, T rhs) { return lhs * rhs; }

    //! read the idx-th input element
    __device__ T visit(uint32_t idx) const { return data[idx]; }

    //! make an op that continues the reduction on a contiguous temp buffer
    static ProdOp make_contig(const T* data) { return ProdOp(data); }
};

/*!
 * \brief cumprod kernel launcher; defined in kern_impl.cuinl
 * \tparam T output data type
 * \tparam Op reduction operator class, which must provide following interface:
 *      typedef ContigOp
 *      static T init(): the identity element
 *      static T apply(T lhs, T rhs): the reduction operation
 *      T visit(uint32_t idx) const: access input
 *      static ContigOp make_contig(const T *data): make an Op to continue
 *          reduction on temp buffer
 *
 * Note that Op::init() must be accessible from both host and device.
 *
 * In exclusive mode, Op::init() would be filled to the boundary
 *
 * The buffer in *op* and *dst* should not have identical memory addresses.
 */
// NOTE(review): the template parameter list was lost in the source; it is
// reconstructed from the exclusive-mode note above and the matching cumsum
// launcher — confirm against the definition in kern_impl.cuinl.
template <typename T, bool exclusive, bool reverse, typename Op>
void run_kern(
        T* dst, void* workspace, uint32_t workspace_size, uint32_t A, uint32_t B,
        uint32_t C, const Op& op, cudaStream_t stream);

/*!
 * \brief get required workspace size for cumprod, in bytes
 * \param item_size size of item; i.e. sizeof(T) in run_kern
 *
 * Note: cuda device must be set to the computing device before calling this
 * function.
 */
uint32_t get_workspace_in_bytes(uint32_t A, uint32_t B, uint32_t C, uint32_t item_size);

}  // namespace cumprod
}  // namespace cuda
}  // namespace megdnn

// vim: ft=cpp syntax=cpp.doxygen