
feat(dnn/cuda): generate cutlass kimpls using cmake and bazel

GitOrigin-RevId: da3bcfb85a
Branch: release-1.5
Author: Megvii Engine Team (huangxinda), 4 years ago
Commit: 4eda338876
100 changed files with 4021 additions and 5192 deletions
1. +1 -0 .gitattributes
2. +18 -0 dnn/scripts/cutlass_generator/BUILD
3. +19 -0 dnn/scripts/cutlass_generator/README.md
4. +614 -0 dnn/scripts/cutlass_generator/conv2d_operation.py
5. +1085 -0 dnn/scripts/cutlass_generator/gemm_operation.py
6. +38 -0 dnn/scripts/cutlass_generator/gen_list.py
7. +651 -0 dnn/scripts/cutlass_generator/generator.py
8. +27 -0 dnn/scripts/cutlass_generator/lazy_file.py
9. +614 -0 dnn/scripts/cutlass_generator/library.py
10. +578 -0 dnn/scripts/cutlass_generator/list.bzl
11. +351 -0 dnn/scripts/cutlass_generator/manifest.py
12. +25 -0 dnn/src/CMakeLists.txt
13. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu
14. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu
15. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu
16. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu
17. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu
18. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu
19. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu
20. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu
21. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu
22. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu
23. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu
24. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu
25. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu
26. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu
27. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu
28. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu
29. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu
30. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu
31. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu
32. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu
33. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu
34. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu
35. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu
36. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu
37. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu
38. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu
39. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu
40. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu
41. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu
42. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu
43. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu
44. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu
45. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu
46. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu
47. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu
48. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu
49. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu
50. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu
51. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu
52. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu
53. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu
54. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu
55. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu
56. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu
57. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu
58. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu
59. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu
60. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu
61. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu
62. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu
63. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu
64. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu
65. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu
66. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu
67. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu
68. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu
69. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu
70. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu
71. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu
72. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu
73. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu
74. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu
75. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu
76. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu
77. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu
78. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu
79. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu
80. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu
81. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu
82. +0 -59 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu
83. +0 -59 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu
84. +0 -59 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu
85. +0 -59 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu
86. +0 -59 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu
87. +0 -59 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu
88. +0 -59 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu
89. +0 -59 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu
90. +0 -59 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu
91. +0 -59 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu
92. +0 -59 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu
93. +0 -59 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu
94. +0 -59 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu
95. +0 -59 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu
96. +0 -59 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu
97. +0 -59 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu
98. +0 -59 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu
99. +0 -59 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu
100. +0 -59 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu

.gitattributes (+1, -0)

@@ -1,5 +1,6 @@
# Mark generated files as binary, ignore them in git diff.
# dnn
dnn/scripts/cutlass_generator/list.bzl binary
dnn/src/cuda/conv_bias/int4/kimpl/* binary
dnn/src/cuda/conv_bias/int8/kimpl/* binary
dnn/src/cuda/conv_bias/int8_imma/kimpl/* binary


dnn/scripts/cutlass_generator/BUILD (+18, -0)

@@ -0,0 +1,18 @@
load("list.bzl", "cutlass_gen_list")

genrule(
name = "cutlass_kimpls",
outs = cutlass_gen_list,
cmd = """GEN=$(location //brain/megbrain/dnn/scripts/cutlass_generator:generator.py)
pwd > /tmp/a
echo $(@D) > /tmp/b
python3 $$GEN --operations gemm --type simt $(@D)
python3 $$GEN --operations gemv --type simt $(@D)
python3 $$GEN --operations deconv --type simt $(@D)
python3 $$GEN --operations conv2d --type simt $(@D)
python3 $$GEN --operations conv2d --type tensorop8816 $(@D)
python3 $$GEN --operations conv2d --type tensorop8832 $(@D)
""",
tools = ["//brain/megbrain/dnn/scripts/cutlass_generator:generator.py"],
visibility = ["//visibility:public"],
)

dnn/scripts/cutlass_generator/README.md (+19, -0)

@@ -0,0 +1,19 @@
# Generate device kernel registration code for CUTLASS kernels
## Usage
```bash
python3 generator.py [--operations {gemm, gemv, conv2d, deconv}] [--type {simt, tensorop8816, tensorop8832}] output
```
- `operations`: operation kind, one of gemm|gemv|conv2d|deconv
- `type`: opcode class, one of simt|tensorop8816|tensorop8832
- `output`: output directory for the generated CUTLASS kernels

## Generate file list for bazel

We generate `list.bzl` because Bazel's `genrule` requires the complete output file list to be known in the analysis phase.

Please call `gen_list.py` when new operations are added.

```bash
python3 gen_list.py
```
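
The generated file is just a Starlark list of kernel file names, one `"<procedural_name>.cu"` entry per kernel (see `gen_list.py` later in this commit). An abridged editorial sketch, reusing one of the kernel names deleted by this commit:

```python
# list.bzl (abridged sketch; the committed file is 578 generated lines)
cutlass_gen_list = [
    "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu",
    # ... one entry per generated kernel ...
]
```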

dnn/scripts/cutlass_generator/conv2d_operation.py (+614, -0)

@@ -0,0 +1,614 @@
#
# \file generator.py
#
# \brief Generates the CUTLASS Library's instances
#
#

import enum
import os.path
import shutil
from typing import Tuple, List

from lazy_file import LazyFile
from library import *

###################################################################################################

#
class Conv2dOperation:
    #
    def __init__(self, conv_kind, conv_type, arch, tile_description, src, flt, bias, dst, element_epilogue, \
                 epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity4, \
                 need_load_from_const = True, implicit_gemm_mode = ImplicitGemmMode.GemmNt):

        self.operation_kind = OperationKind.Conv2d
        self.conv_kind = conv_kind
        self.arch = arch
        self.tile_description = tile_description
        self.conv_type = conv_type
        self.src = src
        self.flt = flt
        self.bias = bias
        self.dst = dst
        self.element_epilogue = element_epilogue
        self.epilogue_functor = epilogue_functor
        self.swizzling_functor = swizzling_functor
        self.need_load_from_const = need_load_from_const
        self.implicit_gemm_mode = implicit_gemm_mode

    #
    def accumulator_type(self):
        accum = self.tile_description.math_instruction.element_accumulator

        return accum

    #
    def core_name(self):
        ''' The basic operation kind is prefixed with a letter indicating the accumulation type. '''

        intermediate_type = ''

        if self.tile_description.math_instruction.opcode_class == OpcodeClass.TensorOp:
            inst_shape = "%d%d%d" % tuple(self.tile_description.math_instruction.instruction_shape)
            if self.tile_description.math_instruction.element_a != self.flt.element and \
               self.tile_description.math_instruction.element_a != self.accumulator_type():
                intermediate_type = DataTypeNames[self.tile_description.math_instruction.element_a]
        else:
            inst_shape = ''

        unity_kernel = ''
        if not self.need_load_from_const:
            unity_kernel = '_1x1'

        return "%s%s%s%s%s_%s" % (ShortDataTypeNames[self.accumulator_type()], \
            inst_shape, intermediate_type, ConvKindNames[self.conv_kind], unity_kernel, \
            ShortEpilogueNames[self.epilogue_functor])

    #
    def extended_name(self):
        if self.dst.element != self.tile_description.math_instruction.element_accumulator:
            if self.src.element != self.flt.element:
                extended_name = "${element_dst}_${core_name}_${element_src}_${element_flt}"
            elif self.src.element == self.flt.element:
                extended_name = "${element_dst}_${core_name}_${element_src}"
        else:
            if self.src.element != self.flt.element:
                extended_name = "${core_name}_${element_src}_${element_flt}"
            elif self.src.element == self.flt.element:
                extended_name = "${core_name}_${element_src}"

        extended_name = SubstituteTemplate(extended_name, {
            'element_src': DataTypeNames[self.src.element],
            'element_flt': DataTypeNames[self.flt.element],
            'element_dst': DataTypeNames[self.dst.element],
            'core_name': self.core_name()
        })

        return extended_name

    #
    def layout_name(self):
        if self.src.layout == self.dst.layout:
            layout_name = "${src_layout}_${flt_layout}"
        else:
            layout_name = "${src_layout}_${flt_layout}_${dst_layout}"

        layout_name = SubstituteTemplate(layout_name, {
            'src_layout': ShortLayoutTypeNames[self.src.layout],
            'flt_layout': ShortLayoutTypeNames[self.flt.layout],
            'dst_layout': ShortLayoutTypeNames[self.dst.layout],
        })
        return layout_name

    #
    def configuration_name(self):
        ''' The full procedural name indicates architecture, extended name, tile size, and layout. '''

        opcode_class_name = OpcodeClassNames[self.tile_description.math_instruction.opcode_class]
        warp_shape = [int(self.tile_description.threadblock_shape[idx] / self.tile_description.warp_count[idx]) for idx in range(3)]

        threadblock = "%dx%dx%d_%dx%dx%d_%d" % (
            self.tile_description.threadblock_shape[0],
            self.tile_description.threadblock_shape[1],
            self.tile_description.threadblock_shape[2],
            warp_shape[0],
            warp_shape[1],
            warp_shape[2],
            self.tile_description.stages,
        )

        configuration_name = "cutlass_${opcode_class}_${extended_name}_${threadblock}_${layout}"

        return SubstituteTemplate(
            configuration_name,
            {
                'opcode_class': opcode_class_name,
                'extended_name': self.extended_name(),
                'threadblock': threadblock,
                'layout': self.layout_name(),
            }
        )

    #
    def procedural_name(self):
        ''' The full procedural name indicates architecture, extended name, tile size, and layout. '''
        return self.configuration_name()
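
Read together, `core_name`, `extended_name`, `layout_name`, and `configuration_name` produce exactly the kimpl file names this commit deletes. A worked decomposition of one of them (editorial annotation, not part of the commit):

```python
# cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc32hw32
#
# tensorop              opcode class (OpcodeClassNames)
# s4                    dst element, prepended since dst differs from the s32 accumulator
# i8832                 'i' = s32 accumulator short name + 8x8x32 instruction shape
# fprop                 ConvKindNames[conv_kind]
# _1x1                  unity-kernel suffix (need_load_from_const == False)
# relu                  ShortEpilogueNames[epilogue_functor]
# s4                    src element (DataTypeNames[src.element])
# 128x32x64_64x32x64_2  threadblock shape, warp shape, stage count
# nhwc_nc32hw32         src layout + filter layout (dst layout equals src layout)
```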

###################################################################################################
#
# Emits single instances of a CUTLASS device-wide operator
#
###################################################################################################

class EmitConv2dInstance:
    def __init__(self):
        self.template = """
// kernel instance "${operation_name}" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        ${element_src},
        ${layout_src},
        ${element_flt},
        ${layout_flt},
        ${element_dst},
        ${layout_dst},
        ${element_bias},
        ${layout_bias},
        ${element_accumulator},
        ${conv_type},
        ${opcode_class},
        ${arch},
        cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>,
        cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>,
        cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>,
        ${epilogue_functor}<
            ${element_dst},
            ${epilogue_vector_length},
            ${element_accumulator},
            ${element_bias},
            ${element_epilogue}
        >,
        ${swizzling_functor},
        ${stages},
        ${alignment_src},
        ${alignment_filter},
        ${nonuninity_kernel},
        ${math_operator},
        ${implicit_gemm_mode}>;
"""

    def emit(self, operation):

        warp_shape = [int(operation.tile_description.threadblock_shape[idx] / operation.tile_description.warp_count[idx]) for idx in range(3)]

        epilogue_vector_length = int(min(operation.dst.alignment * DataTypeSize[operation.dst.element], 128) / DataTypeSize[operation.dst.element])

        values = {
            'operation_name': operation.procedural_name(),
            'conv_type': ConvTypeTag[operation.conv_type],
            'element_src': DataTypeTag[operation.src.element],
            'layout_src': LayoutTag[operation.src.layout],
            'element_flt': DataTypeTag[operation.flt.element],
            'layout_flt': LayoutTag[operation.flt.layout],
            'element_dst': DataTypeTag[operation.dst.element],
            'layout_dst': LayoutTag[operation.dst.layout],
            'element_bias': DataTypeTag[operation.bias.element],
            'layout_bias': LayoutTag[operation.bias.layout],
            'element_accumulator': DataTypeTag[operation.accumulator_type()],
            'opcode_class': OpcodeClassTag[operation.tile_description.math_instruction.opcode_class],
            'arch': "cutlass::arch::Sm%d" % operation.arch,
            'threadblock_shape_m': str(operation.tile_description.threadblock_shape[0]),
            'threadblock_shape_n': str(operation.tile_description.threadblock_shape[1]),
            'threadblock_shape_k': str(operation.tile_description.threadblock_shape[2]),
            'warp_shape_m': str(warp_shape[0]),
            'warp_shape_n': str(warp_shape[1]),
            'warp_shape_k': str(warp_shape[2]),
            'instruction_shape_m': str(operation.tile_description.math_instruction.instruction_shape[0]),
            'instruction_shape_n': str(operation.tile_description.math_instruction.instruction_shape[1]),
            'instruction_shape_k': str(operation.tile_description.math_instruction.instruction_shape[2]),
            'epilogue_vector_length': str(epilogue_vector_length),
            'epilogue_functor': EpilogueFunctorTag[operation.epilogue_functor],
            'element_epilogue': str(DataTypeTag[operation.element_epilogue]),
            'swizzling_functor': SwizzlingFunctorTag[operation.swizzling_functor],
            'stages': str(operation.tile_description.stages),
            'alignment_src': str(operation.src.alignment),
            'alignment_filter': str(operation.flt.alignment),
            'nonuninity_kernel': str(operation.need_load_from_const).lower(),
            'math_operator': MathOperationTag[operation.tile_description.math_instruction.math_operation],
            'implicit_gemm_mode': ImplicitGemmModeTag[operation.implicit_gemm_mode]
        }

        return SubstituteTemplate(self.template, values)

class EmitDeconvInstance:
    def __init__(self):
        self.template = """
// kernel instance "${operation_name}" generated by cutlass generator
using Deconvolution =
    typename cutlass::conv::device::Deconvolution<
        ${element_src},
        ${layout_src},
        ${element_flt},
        ${layout_flt},
        ${element_dst},
        ${layout_dst},
        ${element_bias},
        ${layout_bias},
        ${element_accumulator},
        ${opcode_class},
        ${arch},
        cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>,
        cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>,
        cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>,
        ${epilogue_functor}<
            ${element_dst},
            ${epilogue_vector_length},
            ${element_accumulator},
            ${element_bias},
            ${element_epilogue}
        >,
        ${swizzling_functor},
        ${stages},
        ${alignment_src},
        ${alignment_filter},
        ${nonuninity_kernel},
        ${math_operator},
        ${implicit_gemm_mode}>;
"""

    def emit(self, operation):

        warp_shape = [int(operation.tile_description.threadblock_shape[idx] / operation.tile_description.warp_count[idx]) for idx in range(3)]

        epilogue_vector_length = int(min(operation.dst.alignment * DataTypeSize[operation.dst.element], 128) / DataTypeSize[operation.dst.element])

        values = {
            'operation_name': operation.procedural_name(),
            'element_src': DataTypeTag[operation.src.element],
            'layout_src': LayoutTag[operation.src.layout],
            'element_flt': DataTypeTag[operation.flt.element],
            'layout_flt': LayoutTag[operation.flt.layout],
            'element_dst': DataTypeTag[operation.dst.element],
            'layout_dst': LayoutTag[operation.dst.layout],
            'element_bias': DataTypeTag[operation.bias.element],
            'layout_bias': LayoutTag[operation.bias.layout],
            'element_accumulator': DataTypeTag[operation.accumulator_type()],
            'opcode_class': OpcodeClassTag[operation.tile_description.math_instruction.opcode_class],
            'arch': "cutlass::arch::Sm%d" % operation.arch,
            'threadblock_shape_m': str(operation.tile_description.threadblock_shape[0]),
            'threadblock_shape_n': str(operation.tile_description.threadblock_shape[1]),
            'threadblock_shape_k': str(operation.tile_description.threadblock_shape[2]),
            'warp_shape_m': str(warp_shape[0]),
            'warp_shape_n': str(warp_shape[1]),
            'warp_shape_k': str(warp_shape[2]),
            'instruction_shape_m': str(operation.tile_description.math_instruction.instruction_shape[0]),
            'instruction_shape_n': str(operation.tile_description.math_instruction.instruction_shape[1]),
            'instruction_shape_k': str(operation.tile_description.math_instruction.instruction_shape[2]),
            'epilogue_vector_length': str(epilogue_vector_length),
            'epilogue_functor': EpilogueFunctorTag[operation.epilogue_functor],
            'element_epilogue': str(DataTypeTag[operation.element_epilogue]),
            'swizzling_functor': SwizzlingFunctorTag[operation.swizzling_functor],
            'stages': str(operation.tile_description.stages),
            'alignment_src': str(operation.src.alignment),
            'alignment_filter': str(operation.flt.alignment),
            'nonuninity_kernel': str(operation.need_load_from_const).lower(),
            'math_operator': MathOperationTag[operation.tile_description.math_instruction.math_operation],
            'implicit_gemm_mode': ImplicitGemmModeTag[operation.implicit_gemm_mode]
        }

        return SubstituteTemplate(self.template, values)


###################################################################################################
#
# Generator functions for all layouts
#
###################################################################################################

#
def GenerateConv2d(conv_kind, tile_descriptions, src_layout, flt_layout, dst_layout, dst_type, min_cc, src_align = 32, flt_align = 32, dst_align = 128, \
                   skip_unity_kernel = False, implicit_gemm_mode = ImplicitGemmMode.GemmNt):
    operations = []

    element_epilogue = DataType.f32
    if conv_kind == ConvKind.Fprop:
        if src_layout == LayoutType.TensorNHWC:
            swizzling_functor = SwizzlingFunctor.ConvFpropNHWC
        else:
            swizzling_functor = SwizzlingFunctor.ConvFpropNCxHWx
    else:
        swizzling_functor = SwizzlingFunctor.ConvDgradNCxHWx

    # skip rule
    def filter_tile_with_layout(tile: TileDescription, layout: LayoutType) -> bool:
        return layout == LayoutType.TensorNC32HW32 and \
               tile.threadblock_shape[0] % 32 != 0

    # rule for bias_type and epilogues
    def get_bias_type_and_epilogues(tile: TileDescription, \
                                    out_dtype: DataType) -> Tuple[DataType, List[EpilogueFunctor]]:
        if tile.math_instruction.element_accumulator == DataType.s32 and \
           out_dtype != DataType.f32:
            bias_type = DataType.s32
            if tile.math_instruction.element_b == DataType.u4:
                epilogues = [EpilogueFunctor.BiasAddLinearCombinationClamp, EpilogueFunctor.BiasAddLinearCombinationReluClamp]
            else:
                epilogues = [EpilogueFunctor.BiasAddLinearCombinationClamp, EpilogueFunctor.BiasAddLinearCombinationReluClamp, \
                             EpilogueFunctor.BiasAddLinearCombinationHSwishClamp]
        elif tile.math_instruction.element_accumulator == DataType.f32 or \
             out_dtype == DataType.f32:
            bias_type = DataType.f32
            epilogues = [EpilogueFunctor.BiasAddLinearCombination, EpilogueFunctor.BiasAddLinearCombinationRelu, \
                         EpilogueFunctor.BiasAddLinearCombinationHSwish]
        return bias_type, epilogues

    # rule for filter alignment
    def get_flt_align(tile: TileDescription) -> int:
        nonlocal flt_align
        if tile.math_instruction.opcode_class == OpcodeClass.Simt \
           and tile.math_instruction.element_accumulator == DataType.s32:
            thread_num = tile.warp_count[0] * tile.warp_count[1] * tile.warp_count[2] * 32
            flt_block = tile.threadblock_shape[0] * tile.threadblock_shape[2] \
                        * DataTypeSize[tile.math_instruction.element_a]
            load_per_thread = flt_block // thread_num
            if load_per_thread >= 128:
                flt_align = 128
            elif load_per_thread >= 64:
                flt_align = 64
            else:
                assert load_per_thread >= 32
                flt_align = 32
        return flt_align

    def get_dst_align(tile: TileDescription, out_layout: LayoutType) -> int:
        nonlocal dst_align
        if tile.math_instruction.opcode_class == OpcodeClass.TensorOp \
           and dst_layout == LayoutType.TensorNC4HW4:
            dst_align = 32
        return dst_align

    def filter_epilogue_with_conv_kind(epilogue: EpilogueFunctor, conv_kind: ConvKind) -> bool:
        return conv_kind == ConvKind.Dgrad \
               and epilogue != EpilogueFunctor.BiasAddLinearCombinationClamp

    # loop over all tile descriptions
    for tile in tile_descriptions:
        if filter_tile_with_layout(tile, dst_layout):
            continue

        bias_type, epilogues = get_bias_type_and_epilogues(tile, dst_type)

        flt_align = get_flt_align(tile)

        dst_align = get_dst_align(tile, dst_layout)

        for epilogue in epilogues:
            if filter_epilogue_with_conv_kind(epilogue, conv_kind):
                continue

            if dst_type == DataType.f32:
                bias_type = DataType.f32
            #
            src = TensorDescription(tile.math_instruction.element_b, src_layout, int(src_align / DataTypeSize[tile.math_instruction.element_b]))
            flt = TensorDescription(tile.math_instruction.element_a, flt_layout, int(flt_align / DataTypeSize[tile.math_instruction.element_a]))
            bias = TensorDescription(bias_type, dst_layout, max(1, int(32 / DataTypeSize[bias_type])))
            dst = TensorDescription(dst_type, dst_layout, int(dst_align / DataTypeSize[dst_type]))

            new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src, flt, bias, dst, element_epilogue, epilogue, swizzling_functor, True, implicit_gemm_mode)
            operations.append(new_operation)
            if not skip_unity_kernel:
                new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src, flt, bias, dst, element_epilogue, epilogue, swizzling_functor, False, implicit_gemm_mode)
                operations.append(new_operation)
    return operations

###################################################################################################
#
# Emitters functions for all targets
#
###################################################################################################

class EmitConv2dConfigurationLibrary:
    def __init__(self, operation_path, configuration_name):
        self.configuration_name = configuration_name
        self.configuration_path = os.path.join(operation_path, "%s.cu" % configuration_name)

        self.instance_emitter = EmitConv2dInstance()

        self.instance_template = """
${operation_instance}

// Derived class
struct ${operation_name} :
    public ${operation_name}_base { };

///////////////////////////////////////////////////////////////////////////////////////////////////

"""

        self.header_template = """
/*
    Generated by conv2d_operation.py - Do not edit.
*/

///////////////////////////////////////////////////////////////////////////////////////////////////

#include "cutlass/cutlass.h"
#include "cutlass/library/library.h"
#include "cutlass/library/manifest.h"

#include "library_internal.h"
#include "conv2d_operation.h"

///////////////////////////////////////////////////////////////////////////////////////////////////
"""

        self.configuration_header = """

namespace cutlass {
namespace library {

// Initialize all instances
void initialize_${configuration_name}(Manifest &manifest) {

"""

        self.configuration_instance = """
using Operation_${operation_name} = cutlass::conv::device::ImplicitGemmConvolution<
    ${operation_name}>;

manifest.append(new cutlass::library::Conv2dOperation<
    Operation_${operation_name}>(
        "${operation_name}"));

"""

        self.configuration_epilogue = """
}
"""

        self.epilogue_template = """

///////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace library
} // namespace cutlass

///////////////////////////////////////////////////////////////////////////////////////////////////

"""

    #
    def __enter__(self):
        self.configuration_file = open(self.configuration_path, "w")
        self.configuration_file.write(SubstituteTemplate(self.header_template, {
            'configuration_name': self.configuration_name
        }))
        self.operations = []
        return self

    #
    def emit(self, operation):
        self.operations.append(operation)
        self.configuration_file.write(SubstituteTemplate(self.instance_template, {
            'configuration_name': self.configuration_name,
            'operation_name': operation.procedural_name(),
            'operation_instance': self.instance_emitter.emit(operation)
        }))

    #
    def __exit__(self, exception_type, exception_value, traceback):

        self.configuration_file.write(SubstituteTemplate(self.configuration_header, {
            'configuration_name': self.configuration_name
        }))

        for operation in self.operations:
            self.configuration_file.write(SubstituteTemplate(self.configuration_instance, {
                'configuration_name': self.configuration_name,
                'operation_name': operation.procedural_name()
            }))

        self.configuration_file.write(self.configuration_epilogue)
        self.configuration_file.write(self.epilogue_template)
        self.configuration_file.close()

###################################################################################################
###################################################################################################
#
# Emitters for Conv Kernel Wrapper
#
###################################################################################################

class EmitConvSingleKernelWrapper():
    def __init__(self, kernel_path, operation, wrapper_path):
        self.kernel_path = kernel_path
        self.wrapper_path = wrapper_path
        self.operation = operation

        self.conv_wrappers = { \
            ConvKind.Fprop: """
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
    const typename Convolution::ElementSrc* d_src,
    const typename Convolution::ElementFilter* d_filter,
    const typename Convolution::ElementBias* d_bias,
    const typename Convolution::ElementDst* d_z,
    typename Convolution::ElementDst* d_dst,
    int* workspace,
    typename Convolution::ConvolutionParameter const& conv_param,
    typename Convolution::EpilogueOutputOp::Params const& epilogue,
    cudaStream_t stream,
    typename Convolution::ExtraParam extra_param);
""", \
            ConvKind.Dgrad: """
template void megdnn::cuda::cutlass_wrapper::cutlass_deconvolution_wrapper<Deconvolution>(
    const typename Deconvolution::ElementSrc* d_src,
    const typename Deconvolution::ElementFilter* d_filter,
    const typename Deconvolution::ElementBias* d_bias,
    const typename Deconvolution::ElementDst* d_z,
    typename Deconvolution::ElementDst* d_dst,
    int* workspace,
    typename Deconvolution::ConvolutionParameter const& conv_param,
    typename Deconvolution::EpilogueOutputOp::Params const& epilogue,
    cudaStream_t stream);
""", \
        }
        if self.operation.conv_kind == ConvKind.Fprop:
            self.instance_emitter = EmitConv2dInstance()
        else:
            assert self.operation.conv_kind == ConvKind.Dgrad
            self.instance_emitter = EmitDeconvInstance()

        self.header_template = """
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "${wrapper_path}"
"""
        self.instance_template = """
${operation_instance}
"""
        self.wrapper_template = """
${wrapper_instance}
"""

        self.epilogue_template = """
#pragma GCC diagnostic pop
#endif
"""

    #
    def __enter__(self):
        self.kernel_path = os.path.join(self.kernel_path, "%s.cu" % self.operation.procedural_name())
        self.kernel_file = LazyFile(self.kernel_path)
        self.kernel_file.write(SubstituteTemplate(self.header_template, {
            'wrapper_path': self.wrapper_path,
        }))
        return self

    #
    def emit(self):
        self.kernel_file.write(SubstituteTemplate(self.instance_template, {
            'operation_instance': self.instance_emitter.emit(self.operation),
        }))

        # emit wrapper
        wrapper = SubstituteTemplate(self.wrapper_template, {
            'wrapper_instance': self.conv_wrappers[self.operation.conv_kind],
        })
        self.kernel_file.write(wrapper)

    #
    def __exit__(self, exception_type, exception_value, traceback):
        self.kernel_file.write(self.epilogue_template)
        self.kernel_file.close()


###################################################################################################
###################################################################################################


dnn/scripts/cutlass_generator/gemm_operation.py (+1085, -0)

File diff suppressed because it is too large.


dnn/scripts/cutlass_generator/gen_list.py (+38, -0)

@@ -0,0 +1,38 @@
from generator import (
    GenerateGemmOperations,
    GenerateGemvOperations,
    GenerateConv2dOperations,
    GenerateDeconvOperations,
)


class GenArg:
    def __init__(self, gen_op, gen_type):
        self.operations = gen_op
        self.type = gen_type


def write_op_list(f, gen_op, gen_type):
    if gen_op == "gemm":
        operations = GenerateGemmOperations(GenArg(gen_op, gen_type))
    elif gen_op == "gemv":
        operations = GenerateGemvOperations(GenArg(gen_op, gen_type))
    elif gen_op == "conv2d":
        operations = GenerateConv2dOperations(GenArg(gen_op, gen_type))
    elif gen_op == "deconv":
        operations = GenerateDeconvOperations(GenArg(gen_op, gen_type))
    for op in operations:
        f.write(' "%s.cu",\n' % op.procedural_name())


if __name__ == "__main__":
    with open("list.bzl", "w") as f:
        f.write("# Generated by dnn/scripts/cutlass_generator/gen_list.py\n\n")
        f.write("cutlass_gen_list = [\n")
        write_op_list(f, "gemm", "simt")
        write_op_list(f, "gemv", "simt")
        write_op_list(f, "deconv", "simt")
        write_op_list(f, "conv2d", "simt")
        write_op_list(f, "conv2d", "tensorop8816")
        write_op_list(f, "conv2d", "tensorop8832")
        f.write("]")

dnn/scripts/cutlass_generator/generator.py (+651, -0)

@@ -0,0 +1,651 @@
#
# \file generator.py
#
# \brief Generates the CUTLASS Library's instances
#

import enum
import os.path
import shutil
import argparse

from library import *
from manifest import *
###################################################################################################

#
def CudaToolkitVersionSatisfies(semantic_ver_string, major, minor, patch = 0):

    # by default, use the latest CUDA Toolkit version
    cuda_version = [11, 0, 132]

    # Update cuda_version based on parsed string
    if semantic_ver_string != '':
        for i, x in enumerate([int(x) for x in semantic_ver_string.split('.')]):
            if i < len(cuda_version):
                cuda_version[i] = x
            else:
                cuda_version.append(x)
    return cuda_version >= [major, minor, patch]
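
A quick editorial illustration of the list-comparison semantics above (not part of the commit):

```python
# "10.2" overrides major/minor and keeps the default patch level:
#   [10, 2, 132] >= [10, 1, 0]  -> True
assert CudaToolkitVersionSatisfies("10.2", 10, 1)
# the empty string leaves the default [11, 0, 132] in place:
#   [11, 0, 132] >= [11, 1, 0]  -> False
assert not CudaToolkitVersionSatisfies("", 11, 1)
```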


###################################################################################################
###################################################################################################

#
def CreateGemmOperator(manifest, layouts, tile_descriptions, data_type, \
                       alignment_constraints, complex_transforms = None, epilogue_functor = EpilogueFunctor.LinearCombination, \
                       swizzling_functor = SwizzlingFunctor.Identity8):

    if complex_transforms is None:
        complex_transforms = [(ComplexTransform.none, ComplexTransform.none),]

    element_a, element_b, element_c, element_epilogue = data_type
    operations = []

    # by default, only generate the largest tile and largest alignment
    if manifest.args.kernels == '':
        tile_descriptions = [tile_descriptions[0],]
        alignment_constraints = [alignment_constraints[0],]

    for layout in layouts:
        for tile_description in tile_descriptions:
            for alignment in alignment_constraints:
                for complex_transform in complex_transforms:
                    alignment_c = min(8, alignment)
                    A = TensorDescription(element_a, layout[0], alignment, complex_transform[0])
                    B = TensorDescription(element_b, layout[1], alignment, complex_transform[1])
                    C = TensorDescription(element_c, layout[2], alignment_c)

                    new_operation = GemmOperation(GemmKind.Universal, tile_description.minimum_compute_capability, \
                        tile_description, A, B, C, element_epilogue, epilogue_functor, swizzling_functor)

                    manifest.append(new_operation)
                    operations.append(new_operation)

    return operations

###########################################################################################################
# ConvolutionOperator support variations
# ____________________________________________________________________
# ConvolutionalOperator | Analytic | Optimized
# ____________________________________________________________________
# | Fprop | (strided) | (strided)
# | Dgrad | (strided, unity*) | (unity)
# | Wgrad | (strided) | (strided)
# ____________________________________________________________________
#
# Note: Operators marked (*) are supported but not generated to keep the instantiated kernel count low
###########################################################################################################
# Convolution for 2D operations
def CreateConv2dOperator(manifest, layout, tile_descriptions, data_type, alignment, \
                         conv_kinds = [ConvKind.Fprop, ConvKind.Dgrad, ConvKind.Wgrad], epilogue_functor = EpilogueFunctor.LinearCombination):
    element_a, element_b, element_c, element_epilogue = data_type

    # one exceptional case
    alignment_c = min(8, alignment)

    # iterator algorithm (analytic and optimized)
    iterator_algorithms = [IteratorAlgorithm.Analytic, IteratorAlgorithm.Optimized]

    # by default, only generate the largest tile size
    if manifest.args.kernels == '':
        tile_descriptions = [tile_descriptions[0],]

    operations = []

    for tile in tile_descriptions:
        for conv_kind in conv_kinds:
            for iterator_algorithm in iterator_algorithms:
                A = TensorDescription(element_a, layout[0], alignment)
                B = TensorDescription(element_b, layout[1], alignment)
                C = TensorDescription(element_c, layout[2], alignment_c)

                # unity stride only for Optimized Dgrad
                if (iterator_algorithm == IteratorAlgorithm.Optimized) and (conv_kind == ConvKind.Dgrad):
                    new_operation = Conv2dOperation(conv_kind, iterator_algorithm, tile.minimum_compute_capability, tile, \
                        A, B, C, element_epilogue, StrideSupport.Unity, epilogue_functor)

                    manifest.append(new_operation)
                    operations.append(new_operation)

                # strided dgrad is not supported by Optimized Dgrad
                if (iterator_algorithm == IteratorAlgorithm.Optimized) and (conv_kind == ConvKind.Dgrad):
                    continue

                # strided support for Fprop (Analytic/Optimized), Dgrad (Analytic), and Wgrad (Analytic)
                new_operation = Conv2dOperation(conv_kind, iterator_algorithm, tile.minimum_compute_capability, tile, \
                    A, B, C, element_epilogue, StrideSupport.Strided, epilogue_functor)

                manifest.append(new_operation)
                operations.append(new_operation)

    return operations

###################################################################################################
###################################################################################################

def GenerateConv2d_Simt(args):
    operations = []

    layouts = [
        (LayoutType.TensorNC4HW4, LayoutType.TensorC4RSK4),
    ]
    math_instructions = [
        MathInstruction( \
            [1, 1, 4], \
            DataType.s8, DataType.s8, DataType.s32, \
            OpcodeClass.Simt, \
            MathOperation.multiply_add),
    ]

    dst_layouts = [
        LayoutType.TensorNC4HW4,
        LayoutType.TensorNC32HW32,
        LayoutType.TensorNHWC,
        LayoutType.TensorNHWC,
        LayoutType.TensorNCHW
    ]

    dst_types = [
        DataType.s8,
        DataType.s8,
        DataType.u4,
        DataType.s4,
        DataType.f32,
    ]

    max_cc = 1024

    for math_inst in math_instructions:
        for layout in layouts:
            for dst_type, dst_layout in zip(dst_types, dst_layouts):
                if dst_type == DataType.s4 or dst_type == DataType.u4:
                    min_cc = 75
                    skip_unity_kernel = True
                else:
                    min_cc = 61
                    skip_unity_kernel = False
                tile_descriptions = [
                    TileDescription([128, 128, 32], 2, [2, 4, 1], math_inst, min_cc, max_cc),
                    TileDescription([128,  64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
                    TileDescription([ 64, 128, 32], 2, [1, 4, 1], math_inst, min_cc, max_cc),
                    TileDescription([ 64,  64, 32], 2, [1, 2, 1], math_inst, min_cc, max_cc),
                    TileDescription([128,  32, 32], 2, [2, 1, 1], math_inst, min_cc, max_cc),
                    TileDescription([ 32, 128, 32], 2, [1, 2, 1], math_inst, min_cc, max_cc),
                    TileDescription([ 32,  64, 32], 2, [1, 1, 1], math_inst, min_cc, max_cc),
                    TileDescription([ 64,  32, 32], 2, [1, 1, 1], math_inst, min_cc, max_cc),
                    TileDescription([ 32,  32, 32], 2, [1, 1, 1], math_inst, min_cc, max_cc),
                    TileDescription([ 16, 128, 16], 1, [1, 1, 1], math_inst, min_cc, max_cc),
                    TileDescription([ 16,  64,  8], 2, [1, 1, 1], math_inst, min_cc, max_cc),
                ]
                operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1],
                                             dst_layout, dst_type, min_cc, 32, 32, 32,
                                             skip_unity_kernel)
    return operations


def GenerateConv2d_TensorOp_8816(args):
    operations = []

    layouts = [
        (LayoutType.TensorNC32HW32, LayoutType.TensorC32RSK32),
    ]
    math_instructions = [
        MathInstruction( \
            [8, 8, 16], \
            DataType.s8, DataType.s8, DataType.s32, \
            OpcodeClass.TensorOp, \
            MathOperation.multiply_add_saturate),
    ]

    dst_layouts = [
        LayoutType.TensorNC32HW32,
        LayoutType.TensorNC4HW4,
    ]

    dst_types = [
        DataType.s8,
        DataType.s8,
    ]

    min_cc = 75
    max_cc = 1024

    for math_inst in math_instructions:
        for layout in layouts:
            for dst_type, dst_layout in zip(dst_types, dst_layouts):
                if dst_layout == LayoutType.TensorNC32HW32:
                    tile_descriptions = [
                        TileDescription([256, 128,  64], 2, [4, 2, 1], math_inst, min_cc, max_cc),
                        TileDescription([128, 256,  64], 2, [2, 4, 1], math_inst, min_cc, max_cc),
                        TileDescription([128, 128,  64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
                        TileDescription([ 64, 128,  64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
                        TileDescription([128,  64,  64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
                        TileDescription([ 64,  64,  64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
                        TileDescription([ 32,  64,  64], 2, [1, 4, 1], math_inst, min_cc, max_cc),
                    ]
                else:
                    assert dst_layout == LayoutType.TensorNC4HW4
                    tile_descriptions = [
                        TileDescription([256, 128,  64], 2, [4, 2, 1], math_inst, min_cc, max_cc),
                        TileDescription([128, 256,  64], 2, [2, 4, 1], math_inst, min_cc, max_cc),
                        TileDescription([128, 128,  64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
                        TileDescription([ 64, 128,  64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
                        TileDescription([128,  64,  64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
                        TileDescription([ 64,  64,  64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
                        TileDescription([ 32,  64,  64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
                    ]
                operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1],
                                             dst_layout, dst_type, min_cc, 128, 128, 64,
                                             False)
    return operations

def GenerateConv2d_TensorOp_8832(args):
    operations = []

    layouts = [
        (LayoutType.TensorNC64HW64, LayoutType.TensorC64RSK64),
    ]
    math_instructions = [
        MathInstruction( \
            [8, 8, 32], \
            DataType.s4, DataType.s4, DataType.s32, \
            OpcodeClass.TensorOp, \
            MathOperation.multiply_add_saturate), \
        MathInstruction( \
            [8, 8, 32], \
            DataType.s4, DataType.u4, DataType.s32, \
            OpcodeClass.TensorOp, \
            MathOperation.multiply_add_saturate)
    ]

    dst_layouts = [
        LayoutType.TensorNC64HW64,
    ]

    min_cc = 75
    max_cc = 1024

    for math_inst in math_instructions:
        for layout in layouts:
            for dst_layout in dst_layouts:
                dst_type = math_inst.element_b
                tile_descriptions = [
                    TileDescription([256, 128, 128], 2, [4, 2, 1], math_inst, min_cc, max_cc),
                    TileDescription([128, 128, 128], 2, [2, 2, 1], math_inst, min_cc, max_cc),
                ]
                operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1],
                                             dst_layout, dst_type, min_cc, 128, 128, 64,
                                             True)

    layouts_nhwc = [
        (LayoutType.TensorNHWC, LayoutType.TensorNC8HW8, 32),
        (LayoutType.TensorNHWC, LayoutType.TensorNC16HW16, 64),
        (LayoutType.TensorNHWC, LayoutType.TensorNC32HW32, 128),
    ]

    dst_layouts_nhwc = [
        LayoutType.TensorNHWC,
    ]

    for math_inst in math_instructions:
        for layout in layouts_nhwc:
            for dst_layout in dst_layouts_nhwc:
                dst_type = math_inst.element_b
                tile_descriptions = [
                    TileDescription([128, 32, 64], 2, [2, 1, 1], math_inst, min_cc, max_cc),
                    TileDescription([128, 64, 64], 2, [2, 1, 1], math_inst, min_cc, max_cc),
                ]
                operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1],
                                             dst_layout, dst_type, min_cc, layout[2], layout[2], 32,
                                             False, ImplicitGemmMode.GemmTn)
    return operations

def GenerateDeconv_Simt(args):
    operations = []

    layouts = [
        (LayoutType.TensorNC4HW4, LayoutType.TensorK4RSC4),
    ]
    math_instructions = [
        MathInstruction( \
            [1, 1, 4], \
            DataType.s8, DataType.s8, DataType.s32, \
            OpcodeClass.Simt, \
            MathOperation.multiply_add),
    ]

    dst_layouts = [
        LayoutType.TensorNC4HW4,
    ]

    dst_types = [
        DataType.s8,
    ]

    min_cc = 61
    max_cc = 1024

    for math_inst in math_instructions:
        for layout in layouts:
            for dst_type, dst_layout in zip(dst_types, dst_layouts):
                tile_descriptions = [
                    TileDescription([64, 128, 32], 2, [1, 4, 1], math_inst, min_cc, max_cc),
                    TileDescription([32, 128, 32], 2, [1, 2, 1], math_inst, min_cc, max_cc),
                    TileDescription([16, 128, 16], 2, [1, 2, 1], math_inst, min_cc, max_cc),
                    TileDescription([16, 128, 16], 1, [1, 1, 1], math_inst, min_cc, max_cc),
                    TileDescription([16,  64,  8], 2, [1, 1, 1], math_inst, min_cc, max_cc),
                ]
                operations += GenerateConv2d(ConvKind.Dgrad, tile_descriptions, layout[0], layout[1],
                                             dst_layout, dst_type, min_cc, 32, 32, 32,
                                             True)
    return operations

################################################################################
# parameters
# Edge - for tiles, the edges represent the length of one side
# Ratio - the maximum ratio between 2 edges, limits the skinniness of tiles
# MaxEdge - maximum length of each edge
# Min/Max - minimum/maximum of the product of edge lengths
################################################################################

warpsPerThreadblockEdge = [1, 2, 4, 8, 16]
warpsPerThreadblockRatio = 2
warpsPerThreadblockMax = 16
# NOTE 1x32 and 2x16 warp tile shapes fail validation for ~10% of cases

warpShapeEdges = [8, 16, 32, 64, 128, 256]
warpShapeRatio = 4
warpShapeMax = 64*64
warpShapeMin = 8*8

threadblockEdgeMax = 256

# char, type bits/elem, max tile, L0 threadblock tiles
precisions = {
    "c" : [ "cutlass::complex<float>",  64,  64*128, [ [ 64, 128], [ 64,  32] ] ],
    "d" : [ "double",                   64,  64*64,  [ [ 64,  64], [ 32,  32] ] ],
    "h" : [ "cutlass::half_t",          16, 128*256, [ [256, 128], [ 64, 128], [ 64, 32] ] ],
    "i" : [ "int",                      32, 128*128, [ [128,  64], [ 16,  32] ] ],
    "s" : [ "float",                    32, 128*128, [ [128, 256], [128, 128], [ 64, 64] ] ],
    "z" : [ "cutlass::complex<double>", 128, 64*64,  [ [ 32,  64], [ 16,  32] ] ],
}
# L1 will have a single kernel for every unique shape
# L2 will have everything else
def GenerateGemm_Simt(args):
    ################################################################################
    # warps per threadblock
    ################################################################################
    warpsPerThreadblocks = []
    for warpsPerThreadblock0 in warpsPerThreadblockEdge:
        for warpsPerThreadblock1 in warpsPerThreadblockEdge:
            if warpsPerThreadblock0 / warpsPerThreadblock1 <= warpsPerThreadblockRatio \
               and warpsPerThreadblock1 / warpsPerThreadblock0 <= warpsPerThreadblockRatio \
               and warpsPerThreadblock0 * warpsPerThreadblock1 <= warpsPerThreadblockMax:
                warpsPerThreadblocks.append([warpsPerThreadblock0,
                                             warpsPerThreadblock1])

    ################################################################################
    # warp shapes
    ################################################################################
    warpNumThreads = 32
    warpShapes = []
    for warp0 in warpShapeEdges:
        for warp1 in warpShapeEdges:
            if warp0 / warp1 <= warpShapeRatio \
               and warp1 / warp0 <= warpShapeRatio \
               and warp0 * warp1 <= warpShapeMax \
               and warp0 * warp1 > warpShapeMin:
                warpShapes.append([warp0, warp1])

    # sgemm
    precisionType, precisionBits, threadblockMaxElements, threadblockTilesL0 = precisions["s"]

    layouts = [
        (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.RowMajor),  # nn
        (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.RowMajor),     # nt
        (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.RowMajor),     # tn
        (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.RowMajor),        # tt
    ]
    math_instructions = [
        MathInstruction( \
            [1, 1, 1], \
            DataType.f32, DataType.f32, DataType.f32, \
            OpcodeClass.Simt, \
            MathOperation.multiply_add),
    ]
    min_cc = 50
    max_cc = 1024
    operations = []
    for math_inst in math_instructions:
        for layout in layouts:
            data_type = [
                math_inst.element_a,
                math_inst.element_b,
                math_inst.element_accumulator,
                math_inst.element_accumulator,
            ]
            tile_descriptions = [
                TileDescription([ 64, 256,  8], 2, [2, 4, 1], math_inst, min_cc, max_cc),
                TileDescription([256,  64,  8], 2, [4, 2, 1], math_inst, min_cc, max_cc),
                TileDescription([ 32, 256,  8], 2, [2, 4, 1], math_inst, min_cc, max_cc),
                TileDescription([256,  32,  8], 2, [4, 2, 1], math_inst, min_cc, max_cc),
                TileDescription([128, 128,  8], 2, [4, 2, 1], math_inst, min_cc, max_cc),
                TileDescription([128,  64,  8], 2, [2, 2, 1], math_inst, min_cc, max_cc),
                TileDescription([ 64, 128,  8], 2, [2, 2, 1], math_inst, min_cc, max_cc),
                TileDescription([128,  32,  8], 2, [2, 1, 1], math_inst, min_cc, max_cc),
                TileDescription([ 32, 128,  8], 2, [1, 2, 1], math_inst, min_cc, max_cc),
                TileDescription([ 64,  64,  8], 2, [2, 1, 1], math_inst, min_cc, max_cc),
                TileDescription([ 32,  64,  8], 2, [1, 1, 1], math_inst, min_cc, max_cc),
                TileDescription([ 64,  32,  8], 2, [1, 1, 1], math_inst, min_cc, max_cc),
                TileDescription([ 32,  32,  8], 2, [1, 1, 1], math_inst, min_cc, max_cc),
                TileDescription([  8,  32,  8], 2, [1, 1, 1], math_inst, min_cc, max_cc),
                TileDescription([ 16,  32,  8], 2, [1, 1, 1], math_inst, min_cc, max_cc),
                TileDescription([ 16,  64,  8], 2, [1, 1, 1], math_inst, min_cc, max_cc),
                TileDescription([ 16, 128,  8], 2, [1, 2, 1], math_inst, min_cc, max_cc),
            ]
            for warpsPerThreadblock in warpsPerThreadblocks:
                for warpShape in warpShapes:
                    warpThreadsM = 0
                    if warpShape[0] > warpShape[1]:
                        warpThreadsM = 8
                    else:
                        warpThreadsM = 4
                    warpThreadsN = warpNumThreads / warpThreadsM

                    # skip shapes with conflicting rectangularity
                    # they are unlikely to be fastest
                    blockG = warpsPerThreadblock[0] > warpsPerThreadblock[1]
                    blockL = warpsPerThreadblock[0] < warpsPerThreadblock[1]
                    warpG = warpShape[0] > warpShape[1]
                    warpL = warpShape[0] < warpShape[1]
                    blockG2 = warpsPerThreadblock[0] > warpsPerThreadblock[1] * 2
                    blockL2 = warpsPerThreadblock[0] * 2 < warpsPerThreadblock[1]
                    warpG2 = warpShape[0] > warpShape[1] * 2
                    warpL2 = warpShape[0] * 2 < warpShape[1]
                    if blockG2 and warpL: continue
                    if blockL2 and warpG: continue
                    if warpG2 and blockL: continue
                    if warpL2 and blockG: continue

                    # check threadblock ratios and max
                    threadblockTile = [warpShape[0] * warpsPerThreadblock[0],
                                       warpShape[1] * warpsPerThreadblock[1]]
                    if threadblockTile[0] * threadblockTile[1] > threadblockMaxElements: continue
                    if threadblockTile[0] > threadblockEdgeMax: continue
                    if threadblockTile[1] > threadblockEdgeMax: continue
                    totalThreads = warpNumThreads * warpsPerThreadblock[0] * warpsPerThreadblock[1]

                    # calculate unroll
                    # ensure that every iteration at least a full load of A,B are done
                    unrollMin = 8
                    unrollMin0 = totalThreads // threadblockTile[0]
                    unrollMin1 = totalThreads // threadblockTile[1]
                    unroll = max(unrollMin, unrollMin0, unrollMin1)

                    threadTileM = warpShape[0] // warpThreadsM
                    threadTileN = warpShape[1] // warpThreadsN
                    if threadTileM < 2 or threadTileN < 2: continue
                    if threadTileM * threadTileN * precisionBits > 8 * 8 * 32: continue

                    # epilogue currently only supports N < WarpNumThreads
                    if threadblockTile[1] < warpNumThreads: continue

                    # limit smem
                    smemBitsA = threadblockTile[0] * unroll * 2 * precisionBits
                    smemBitsB = threadblockTile[1] * unroll * 2 * precisionBits
                    smemKBytes = (smemBitsA + smemBitsB) / 8 / 1024
                    if (smemKBytes > 48): continue

                    tile = TileDescription([threadblockTile[0], threadblockTile[1], unroll], \
                                           2, \
                                           [threadblockTile[0]//warpShape[0], threadblockTile[1]//warpShape[1], 1], \
                                           math_inst, min_cc, max_cc)

                    def filter(t: TileDescription) -> bool:
                        nonlocal tile
                        return t.threadblock_shape[0] == tile.threadblock_shape[0] and \
                               t.threadblock_shape[1] == tile.threadblock_shape[1] and \
                               t.threadblock_shape[2] == tile.threadblock_shape[2] and \
                               t.warp_count[0] == tile.warp_count[0] and \
                               t.warp_count[1] == tile.warp_count[1] and \
                               t.warp_count[2] == tile.warp_count[2] and \
                               t.stages == tile.stages

                    if not any(t for t in tile_descriptions if filter(t)): continue

                    operations += GeneratesGemm(tile, data_type, layout[0], layout[1], layout[2], min_cc)
    return operations
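
To make the pruning arithmetic concrete, one editorial walk-through (not part of the commit), assuming `warpsPerThreadblock = [4, 2]` and `warpShape = [32, 64]` for f32 (`precisionBits = 32`):

```python
threadblockTile = [32 * 4, 64 * 2]           # [128, 128]; 16384 <= 128*128, kept
totalThreads = 32 * 4 * 2                    # 256
unroll = max(8, 256 // 128, 256 // 128)      # max(8, 2, 2) = 8
threadTileM, threadTileN = 32 // 4, 64 // 8  # 8 x 8; 8*8*32 == 2048, not > 2048, kept
smemKBytes = (128*8*2*32 + 128*8*2*32) / 8 / 1024  # 16.0 KB <= 48, kept
# -> TileDescription([128, 128, 8], 2, [4, 2, 1], ...), which appears in
#    tile_descriptions above, so GeneratesGemm emits this sgemm kernel.
```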

#
def GenerateGemv_Simt(args):
threadBlockShape_N = [128, 64, 32]
ldgBits_A = [128, 64, 32]
ldgBits_B = [128, 64, 32]

layouts = [
(LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.RowMajor),
]

math_instructions = [
MathInstruction( \
[1, 1, 1], \
DataType.f32, DataType.f32, DataType.f32, \
OpcodeClass.Simt, \
MathOperation.multiply_add),
]
min_cc = 50

operations = []
for math_inst in math_instructions:
for layout in layouts:
data_type = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_accumulator,
math_inst.element_accumulator,
]
for threadblock_shape_n in threadBlockShape_N:
for align_a in ldgBits_A:
for align_b in ldgBits_B:
ldg_elements_a = align_a // DataTypeSize[math_inst.element_a]
ldg_elements_b = align_b // DataTypeSize[math_inst.element_b]
threadblock_shape_k = (256 * ldg_elements_a) // (threadblock_shape_n // ldg_elements_b)
threadblock_shape = [1, threadblock_shape_n, threadblock_shape_k]
thread_shape = [1, ldg_elements_b, ldg_elements_a]

operations.append(GeneratesGemv(math_inst, \
threadblock_shape, \
thread_shape, \
data_type, \
layout[0], \
layout[1], \
layout[2], \
min_cc, \
align_a, \
align_b))
return operations
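
# Worked example of the shape derivation above (f32, so DataTypeSize[f32] is
# 32): align_a = align_b = 128 bits give 4-element vector loads, and with
# threadblock_shape_n = 128 the k-extent becomes (256 * 4) // (128 // 4) == 32,
# i.e. the cutlass_simt_sgemv_batched_strided_1x128_32_tt_align4x4 kernel
# listed in list.bzl.
def _gemv_simt_shape_example():
    ldg_elements_a = 128 // 32
    ldg_elements_b = 128 // 32
    assert (256 * ldg_elements_a) // (128 // ldg_elements_b) == 32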

#
def GenerateConv2dOperations(args):
if args.type == "simt":
return GenerateConv2d_Simt(args)
elif args.type == "tensorop8816":
return GenerateConv2d_TensorOp_8816(args)
else:
assert args.type == "tensorop8832", "operation conv2d only support" \
"simt, tensorop8816 and tensorop8832. (got:{})".format(args.type)
return GenerateConv2d_TensorOp_8832(args)

def GenerateDeconvOperations(args):
assert args.type == "simt", "operation deconv only support" \
"simt. (got:{})".format(args.type)
return GenerateDeconv_Simt(args)

def GenerateGemmOperations(args):
assert args.type == "simt", "operation gemm only support" \
"simt. (got:{})".format(args.type)
return GenerateGemm_Simt(args)

def GenerateGemvOperations(args):
assert args.type == "simt", "operation gemv only support" \
"simt. (got:{})".format(args.type)
return GenerateGemv_Simt(args)

###################################################################################################
###################################################################################################

if __name__ == "__main__":

parser = argparse.ArgumentParser(description="Generates device kernel registration code for CUTLASS Kernels")
parser.add_argument("--operations", type=str, choices=['gemm', 'gemv', 'conv2d', 'deconv'],
required=True, help="Specifies the operation to generate (gemm, gemv, conv2d, deconv)")
parser.add_argument("output", type=str, help="output directory for CUTLASS kernel files")
parser.add_argument("--type", type=str, choices=['simt', 'tensorop8816', 'tensorop8832'],
default='simt', help="kernel type of CUTLASS kernel generator")

operation2wrapper_path = {
"gemm": "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl", \
"gemv": "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl", \
"conv2d": "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl", \
"deconv": "src/cuda/convolution/backward_data/implicit_gemm_deconv_cutlass_wrapper.cuinl", \
}

args = parser.parse_args()

wrapper_path = operation2wrapper_path[args.operations]
if args.operations == "gemm":
operations = GenerateGemmOperations(args)
elif args.operations == "gemv":
operations = GenerateGemvOperations(args)
elif args.operations == "conv2d":
operations = GenerateConv2dOperations(args)
elif args.operations == "deconv":
operations = GenerateDeconvOperations(args)

if args.operations == "conv2d" or args.operations == "deconv":
for operation in operations:
with EmitConvSingleKernelWrapper(args.output, operation, wrapper_path) as emitter:
emitter.emit()
elif args.operations == "gemm" or args.operations == "gemv":
for operation in operations:
with EmitGemmSingleKernelWrapper(args.output, operation, wrapper_path) as emitter:
emitter.emit()
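# Typical invocations (the output directory is illustrative):
#   python3 dnn/scripts/cutlass_generator/generator.py --operations gemm --type simt <output_dir>
#   python3 dnn/scripts/cutlass_generator/generator.py --operations conv2d --type tensorop8816 <output_dir>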
#
###################################################################################################

+ 27
- 0
dnn/scripts/cutlass_generator/lazy_file.py

@@ -0,0 +1,27 @@
#
# \file lazy_file.py
#
# \brief LazyFile rewrites the target file only when its content has changed,
# so that cutlass kimpls are not needlessly regenerated each time cmake runs
#

import io
import os

class LazyFile:
def __init__(self, filename):
self.filename = filename
self.buffer = io.StringIO()

def write(self, data):
self.buffer.write(str(data))

def close(self):
if os.path.isfile(self.filename):
    with open(self.filename) as f:
        old_data = f.read()
else:
    old_data = ""
new_data = self.buffer.getvalue()
if old_data != new_data:
with open(self.filename, "w") as f:
f.write(new_data)
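
# Sketch of the intended use (the file name below is illustrative): writing
# identical content a second time leaves the target untouched, so its mtime
# does not change and cmake/bazel see nothing to rebuild.
def _lazy_file_example():
    f = LazyFile("kimpl_example.cu")
    f.write("// generated\n")
    f.close()   # first close creates the file
    g = LazyFile("kimpl_example.cu")
    g.write("// generated\n")
    g.close()   # same bytes, so the file is not rewritten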

+ 614
- 0
dnn/scripts/cutlass_generator/library.py

@@ -0,0 +1,614 @@
#
# \file library.py
#
# \brief Generates the CUTLASS Library's instances
#

import re

###################################################################################################

import enum

# The following block implements enum.auto() for Python 3.5 variants that don't include it such
# as the default 3.5.2 on Ubuntu 16.04.
#
# https://codereview.stackexchange.com/questions/177309/reimplementing-pythons-enum-auto-for-compatibility

try:
from enum import auto as enum_auto
except ImportError:
__cutlass_library_auto_enum = 0
def enum_auto() -> int:
global __cutlass_library_auto_enum
i = __cutlass_library_auto_enum
__cutlass_library_auto_enum += 1
return i
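
# Note: unlike enum.auto(), this fallback shares a single counter across all
# enums defined after it, so values keep increasing from one enum to the next;
# members within each enum are still distinct, which is all this library
# relies on.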

###################################################################################################

#
class GeneratorTarget(enum.Enum):
Library = enum_auto()
#
GeneratorTargetNames = {
GeneratorTarget.Library: 'library'
}
#

###################################################################################################

#
class DataType(enum.Enum):
b1 = enum_auto()
u4 = enum_auto()
u8 = enum_auto()
u16 = enum_auto()
u32 = enum_auto()
u64 = enum_auto()
s4 = enum_auto()
s8 = enum_auto()
s16 = enum_auto()
s32 = enum_auto()
s64 = enum_auto()
f16 = enum_auto()
bf16 = enum_auto()
f32 = enum_auto()
tf32 = enum_auto()
f64 = enum_auto()
cf16 = enum_auto()
cbf16 = enum_auto()
cf32 = enum_auto()
ctf32 = enum_auto()
cf64 = enum_auto()
cs4 = enum_auto()
cs8 = enum_auto()
cs16 = enum_auto()
cs32 = enum_auto()
cs64 = enum_auto()
cu4 = enum_auto()
cu8 = enum_auto()
cu16 = enum_auto()
cu32 = enum_auto()
cu64 = enum_auto()
invalid = enum_auto()

#
ShortDataTypeNames = {
DataType.s32: 'i',
DataType.f16: 'h',
DataType.f32: 's',
DataType.f64: 'd',
DataType.cf32: 'c',
DataType.cf64: 'z',
}

#
DataTypeNames = {
DataType.b1: "b1",
DataType.u4: "u4",
DataType.u8: "u8",
DataType.u16: "u16",
DataType.u32: "u32",
DataType.u64: "u64",
DataType.s4: "s4",
DataType.s8: "s8",
DataType.s16: "s16",
DataType.s32: "s32",
DataType.s64: "s64",
DataType.f16: "f16",
DataType.bf16: "bf16",
DataType.f32: "f32",
DataType.tf32: "tf32",
DataType.f64: "f64",
DataType.cf16: "cf16",
DataType.cbf16: "cbf16",
DataType.cf32: "cf32",
DataType.ctf32: "ctf32",
DataType.cf64: "cf64",
DataType.cu4: "cu4",
DataType.cu8: "cu8",
DataType.cu16: "cu16",
DataType.cu32: "cu32",
DataType.cu64: "cu64",
DataType.cs4: "cs4",
DataType.cs8: "cs8",
DataType.cs16: "cs16",
DataType.cs32: "cs32",
DataType.cs64: "cs64",
}

DataTypeTag = {
DataType.b1: "cutlass::uint1b_t",
DataType.u4: "cutlass::uint4b_t",
DataType.u8: "uint8_t",
DataType.u16: "uint16_t",
DataType.u32: "uint32_t",
DataType.u64: "uint64_t",
DataType.s4: "cutlass::int4b_t",
DataType.s8: "int8_t",
DataType.s16: "int16_t",
DataType.s32: "int32_t",
DataType.s64: "int64_t",
DataType.f16: "cutlass::half_t",
DataType.bf16: "cutlass::bfloat16_t",
DataType.f32: "float",
DataType.tf32: "cutlass::tfloat32_t",
DataType.f64: "double",
DataType.cf16: "cutlass::complex<cutlass::half_t>",
DataType.cbf16: "cutlass::complex<cutlass::bfloat16_t>",
DataType.cf32: "cutlass::complex<float>",
DataType.ctf32: "cutlass::complex<cutlass::tfloat32_t>",
DataType.cf64: "cutlass::complex<double>",
DataType.cu4: "cutlass::complex<cutlass::uint4b_t>",
DataType.cu8: "cutlass::complex<cutlass::uint8_t>",
DataType.cu16: "cutlass::complex<cutlass::uint16_t>",
DataType.cu32: "cutlass::complex<cutlass::uint32_t>",
DataType.cu64: "cutlass::complex<cutlass::uint64_t>",
DataType.cs4: "cutlass::complex<cutlass::int4b_t>",
DataType.cs8: "cutlass::complex<cutlass::int8_t>",
DataType.cs16: "cutlass::complex<cutlass::int16_t>",
DataType.cs32: "cutlass::complex<cutlass::int32_t>",
DataType.cs64: "cutlass::complex<cutlass::int64_t>",
}

DataTypeSize = {
DataType.b1: 1,
DataType.u4: 4,
DataType.u8: 8,
DataType.u16: 16,
DataType.u32: 32,
DataType.u64: 64,
DataType.s4: 4,
DataType.s8: 8,
DataType.s16: 16,
DataType.s32: 32,
DataType.s64: 64,
DataType.f16: 16,
DataType.bf16: 16,
DataType.f32: 32,
DataType.tf32: 32,
DataType.f64: 64,
DataType.cf16: 32,
DataType.cbf16: 32,
DataType.cf32: 64,
DataType.ctf32: 32,
DataType.cf64: 128,
DataType.cu4: 8,
DataType.cu8: 16,
DataType.cu16: 32,
DataType.cu32: 64,
DataType.cu64: 128,
DataType.cs4: 8,
DataType.cs8: 16,
DataType.cs16: 32,
DataType.cs32: 64,
DataType.cs64: 128,
}

###################################################################################################

#
class ComplexTransform(enum.Enum):
none = enum_auto()
conj = enum_auto()

#
ComplexTransformTag = {
ComplexTransform.none: 'cutlass::ComplexTransform::kNone',
ComplexTransform.conj: 'cutlass::ComplexTransform::kConjugate',
}

#
RealComplexBijection = [
(DataType.f16, DataType.cf16),
(DataType.f32, DataType.cf32),
(DataType.f64, DataType.cf64),
]

#
def is_complex(data_type):
for r, c in RealComplexBijection:
if data_type == c:
return True
return False

#
def get_complex_from_real(real_type):
for r, c in RealComplexBijection:
if real_type == r:
return c
return DataType.invalid

#
def get_real_from_complex(complex_type):
for r, c in RealComplexBijection:
if complex_type == c:
return r
return DataType.invalid

#
class ComplexMultiplyOp(enum.Enum):
multiply_add = enum_auto()
gaussian = enum_auto()

###################################################################################################

#
class MathOperation(enum.Enum):
multiply_add = enum_auto()
multiply_add_saturate = enum_auto()
xor_popc = enum_auto()
multiply_add_fast_bf16 = enum_auto()
multiply_add_fast_f16 = enum_auto()
multiply_add_complex = enum_auto()
multiply_add_complex_gaussian = enum_auto()

#
MathOperationTag = {
MathOperation.multiply_add: 'cutlass::arch::OpMultiplyAdd',
MathOperation.multiply_add_saturate: 'cutlass::arch::OpMultiplyAddSaturate',
MathOperation.xor_popc: 'cutlass::arch::OpXorPopc',
MathOperation.multiply_add_fast_bf16: 'cutlass::arch::OpMultiplyAddFastBF16',
MathOperation.multiply_add_fast_f16: 'cutlass::arch::OpMultiplyAddFastF16',
MathOperation.multiply_add_complex: 'cutlass::arch::OpMultiplyAddComplex',
MathOperation.multiply_add_complex_gaussian: 'cutlass::arch::OpMultiplyAddGaussianComplex',
}

###################################################################################################

#
class LayoutType(enum.Enum):
ColumnMajor = enum_auto()
RowMajor = enum_auto()
ColumnMajorInterleaved2 = enum_auto()
RowMajorInterleaved2 = enum_auto()
ColumnMajorInterleaved32 = enum_auto()
RowMajorInterleaved32 = enum_auto()
ColumnMajorInterleaved64 = enum_auto()
RowMajorInterleaved64 = enum_auto()
TensorNHWC = enum_auto()
TensorNDHWC = enum_auto()
TensorNCHW = enum_auto()
TensorNGHWC = enum_auto()
TensorNC4HW4 = enum_auto()
TensorC4RSK4 = enum_auto()
TensorNC8HW8 = enum_auto()
TensorNC16HW16 = enum_auto()
TensorNC32HW32 = enum_auto()
TensorNC64HW64 = enum_auto()
TensorC32RSK32 = enum_auto()
TensorC64RSK64 = enum_auto()
TensorK4RSC4 = enum_auto()

#
LayoutTag = {
LayoutType.ColumnMajor: 'cutlass::layout::ColumnMajor',
LayoutType.RowMajor: 'cutlass::layout::RowMajor',
LayoutType.ColumnMajorInterleaved2: 'cutlass::layout::ColumnMajorInterleaved<2>',
LayoutType.RowMajorInterleaved2: 'cutlass::layout::RowMajorInterleaved<2>',
LayoutType.ColumnMajorInterleaved32: 'cutlass::layout::ColumnMajorInterleaved<32>',
LayoutType.RowMajorInterleaved32: 'cutlass::layout::RowMajorInterleaved<32>',
LayoutType.ColumnMajorInterleaved64: 'cutlass::layout::ColumnMajorInterleaved<64>',
LayoutType.RowMajorInterleaved64: 'cutlass::layout::RowMajorInterleaved<64>',
LayoutType.TensorNHWC: 'cutlass::layout::TensorNHWC',
LayoutType.TensorNDHWC: 'cutlass::layout::TensorNDHWC',
LayoutType.TensorNCHW: 'cutlass::layout::TensorNCHW',
LayoutType.TensorNGHWC: 'cutlass::layout::TensorNGHWC',
LayoutType.TensorNC4HW4: 'cutlass::layout::TensorNCxHWx<4>',
LayoutType.TensorC4RSK4: 'cutlass::layout::TensorCxRSKx<4>',
LayoutType.TensorNC8HW8: 'cutlass::layout::TensorNCxHWx<8>',
LayoutType.TensorNC16HW16: 'cutlass::layout::TensorNCxHWx<16>',
LayoutType.TensorNC32HW32: 'cutlass::layout::TensorNCxHWx<32>',
LayoutType.TensorC32RSK32: 'cutlass::layout::TensorCxRSKx<32>',
LayoutType.TensorNC64HW64: 'cutlass::layout::TensorNCxHWx<64>',
LayoutType.TensorC64RSK64: 'cutlass::layout::TensorCxRSKx<64>',
LayoutType.TensorK4RSC4: 'cutlass::layout::TensorKxRSCx<4>',
}

#
TransposedLayout = {
LayoutType.ColumnMajor: LayoutType.RowMajor,
LayoutType.RowMajor: LayoutType.ColumnMajor,
LayoutType.ColumnMajorInterleaved2: LayoutType.RowMajorInterleaved2,
LayoutType.RowMajorInterleaved2: LayoutType.ColumnMajorInterleaved2,
LayoutType.ColumnMajorInterleaved32: LayoutType.RowMajorInterleaved32,
LayoutType.RowMajorInterleaved32: LayoutType.ColumnMajorInterleaved32,
LayoutType.ColumnMajorInterleaved64: LayoutType.RowMajorInterleaved64,
LayoutType.RowMajorInterleaved64: LayoutType.ColumnMajorInterleaved64,
LayoutType.TensorNHWC: LayoutType.TensorNHWC
}

#
ShortLayoutTypeNames = {
LayoutType.ColumnMajor: 'n',
LayoutType.ColumnMajorInterleaved2: 'n2',
LayoutType.ColumnMajorInterleaved32: 'n32',
LayoutType.ColumnMajorInterleaved64: 'n64',
LayoutType.RowMajor: 't',
LayoutType.RowMajorInterleaved2: 't2',
LayoutType.RowMajorInterleaved32: 't32',
LayoutType.RowMajorInterleaved64: 't64',
LayoutType.TensorNHWC: 'nhwc',
LayoutType.TensorNDHWC: 'ndhwc',
LayoutType.TensorNCHW: 'nchw',
LayoutType.TensorNGHWC: 'nghwc',
LayoutType.TensorNC4HW4: 'nc4hw4',
LayoutType.TensorC4RSK4: 'c4rsk4',
LayoutType.TensorNC8HW8: 'nc8hw8',
LayoutType.TensorNC16HW16: 'nc16hw16',
LayoutType.TensorNC32HW32: 'nc32hw32',
LayoutType.TensorNC64HW64: 'nc64hw64',
LayoutType.TensorC32RSK32: 'c32rsk32',
LayoutType.TensorC64RSK64: 'c64rsk64',
LayoutType.TensorK4RSC4: 'k4rsc4',
}

#
ShortComplexLayoutNames = {
(LayoutType.ColumnMajor, ComplexTransform.none): 'n',
(LayoutType.ColumnMajor, ComplexTransform.conj): 'c',
(LayoutType.RowMajor, ComplexTransform.none): 't',
(LayoutType.RowMajor, ComplexTransform.conj): 'h'
}

###################################################################################################
#
class OpcodeClass(enum.Enum):
Simt = enum_auto()
TensorOp = enum_auto()
WmmaTensorOp = enum_auto()

OpcodeClassNames = {
OpcodeClass.Simt: 'simt',
OpcodeClass.TensorOp: 'tensorop',
OpcodeClass.WmmaTensorOp: 'wmma_tensorop',
}

OpcodeClassTag = {
OpcodeClass.Simt: 'cutlass::arch::OpClassSimt',
OpcodeClass.TensorOp: 'cutlass::arch::OpClassTensorOp',
OpcodeClass.WmmaTensorOp: 'cutlass::arch::OpClassWmmaTensorOp',
}

###################################################################################################

#
class OperationKind(enum.Enum):
Gemm = enum_auto()
Conv2d = enum_auto()

#
OperationKindNames = {
OperationKind.Gemm: 'gemm',
OperationKind.Conv2d: 'conv2d',
}

#
class Target(enum.Enum):
library = enum_auto()

ArchitectureNames = {
50: 'maxwell',
60: 'pascal',
61: 'pascal',
70: 'volta',
75: 'turing',
80: 'ampere',
}

###################################################################################################

#
def SubstituteTemplate(template, values):
text = template
changed = True
while changed:
changed = False
for key, value in values.items():
regex = "\\$\\{%s\\}" % key
newtext = re.sub(regex, value, text)
if newtext != text:
changed = True
text = newtext
return text
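
# Illustration (the keys and values are made up for the example): substitution
# loops until a full pass changes nothing, so a value may itself contain
# ${...} placeholders that are resolved on a later pass.
def _substitute_template_example():
    text = SubstituteTemplate(
        "${op}_${tile}",
        {"op": "cutlass_simt_sgemm", "tile": "${m}x${n}", "m": "128", "n": "64"})
    assert text == "cutlass_simt_sgemm_128x64"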

###################################################################################################

#
class GemmKind(enum.Enum):
Gemm = enum_auto()
Sparse = enum_auto()
Universal = enum_auto()
PlanarComplex = enum_auto()
PlanarComplexArray = enum_auto()
SplitKParallel = enum_auto()
GemvBatchedStrided = enum_auto()

#
GemmKindNames = {
GemmKind.Gemm: "gemm",
GemmKind.Sparse: "spgemm",
GemmKind.Universal: "gemm",
GemmKind.PlanarComplex: "gemm_planar_complex",
GemmKind.PlanarComplexArray: "gemm_planar_complex_array",
GemmKind.SplitKParallel: "gemm_split_k_parallel",
GemmKind.GemvBatchedStrided: "gemv_batched_strided",
}

#
class EpilogueFunctor(enum.Enum):
LinearCombination = enum_auto()
LinearCombinationClamp = enum_auto()
BiasAddLinearCombination = enum_auto()
BiasAddLinearCombinationRelu = enum_auto()
BiasAddLinearCombinationHSwish = enum_auto()
BiasAddLinearCombinationClamp = enum_auto()
BiasAddLinearCombinationReluClamp = enum_auto()
BiasAddLinearCombinationHSwishClamp = enum_auto()


#
EpilogueFunctorTag = {
EpilogueFunctor.LinearCombination: 'cutlass::epilogue::thread::LinearCombination',
EpilogueFunctor.LinearCombinationClamp: 'cutlass::epilogue::thread::LinearCombinationClamp',
EpilogueFunctor.BiasAddLinearCombination: 'cutlass::epilogue::thread::BiasAddLinearCombination',
EpilogueFunctor.BiasAddLinearCombinationRelu: 'cutlass::epilogue::thread::BiasAddLinearCombinationRelu',
EpilogueFunctor.BiasAddLinearCombinationHSwish: 'cutlass::epilogue::thread::BiasAddLinearCombinationHSwish',
EpilogueFunctor.BiasAddLinearCombinationClamp: 'cutlass::epilogue::thread::BiasAddLinearCombinationClamp',
EpilogueFunctor.BiasAddLinearCombinationReluClamp: 'cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp',
EpilogueFunctor.BiasAddLinearCombinationHSwishClamp: 'cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp',
}

#
ShortEpilogueNames = {
EpilogueFunctor.BiasAddLinearCombinationHSwishClamp: 'hswish',
EpilogueFunctor.BiasAddLinearCombinationReluClamp: 'relu',
EpilogueFunctor.BiasAddLinearCombinationClamp: 'identity',
EpilogueFunctor.BiasAddLinearCombinationHSwish: 'hswish',
EpilogueFunctor.BiasAddLinearCombinationRelu: 'relu',
EpilogueFunctor.BiasAddLinearCombination: 'identity',
}

#
class SwizzlingFunctor(enum.Enum):
Identity1 = enum_auto()
Identity2 = enum_auto()
Identity4 = enum_auto()
Identity8 = enum_auto()
ConvFpropNCxHWx = enum_auto()
ConvFpropNHWC = enum_auto()
ConvDgradNCxHWx = enum_auto()

#
SwizzlingFunctorTag = {
SwizzlingFunctor.Identity1: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>',
SwizzlingFunctor.Identity2: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<2>',
SwizzlingFunctor.Identity4: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>',
SwizzlingFunctor.Identity8: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>',
SwizzlingFunctor.ConvFpropNCxHWx: 'cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle',
SwizzlingFunctor.ConvFpropNHWC: 'cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle',
SwizzlingFunctor.ConvDgradNCxHWx: 'cutlass::conv::threadblock::ConvolutionDgradNCxHWxThreadblockSwizzle',
}

###################################################################################################

class ConvType(enum.Enum):
Convolution = enum_auto()
BatchConvolution = enum_auto()
Local = enum_auto()
LocalShare = enum_auto()

ConvTypeTag = {
ConvType.Convolution: 'cutlass::conv::ConvType::kConvolution',
ConvType.BatchConvolution: 'cutlass::conv::ConvType::kBatchConvolution',
ConvType.Local: 'cutlass::conv::ConvType::kLocal',
ConvType.LocalShare : 'cutlass::conv::ConvType::kLocalShare',
}

#
class ConvKind(enum.Enum):
Fprop = enum_auto()
Dgrad = enum_auto()
Wgrad = enum_auto()

#
ConvKindTag = {
ConvKind.Fprop: 'cutlass::conv::Operator::kFprop',
ConvKind.Dgrad: 'cutlass::conv::Operator::kDgrad',
ConvKind.Wgrad: 'cutlass::conv::Operator::kWgrad'
}

ConvKindNames = {
ConvKind.Fprop: 'fprop',
ConvKind.Dgrad: 'dgrad',
ConvKind.Wgrad: 'wgrad',
}

#
class IteratorAlgorithm(enum.Enum):
Analytic = enum_auto()
Optimized = enum_auto()

#
IteratorAlgorithmTag = {
IteratorAlgorithm.Analytic: 'cutlass::conv::IteratorAlgorithm::kAnalytic',
IteratorAlgorithm.Optimized: 'cutlass::conv::IteratorAlgorithm::kOptimized',
}

IteratorAlgorithmNames = {
IteratorAlgorithm.Analytic: 'analytic',
IteratorAlgorithm.Optimized: 'optimized',
}

#
class StrideSupport(enum.Enum):
Strided = enum_auto()
Unity = enum_auto()

#
StrideSupportTag = {
StrideSupport.Strided: 'cutlass::conv::StrideSupport::kStrided',
StrideSupport.Unity: 'cutlass::conv::StrideSupport::kUnity',
}

StrideSupportNames = {
StrideSupport.Strided: '',
StrideSupport.Unity: 'unity_stride',
}

class ImplicitGemmMode(enum.Enum):
GemmNt = enum_auto()
GemmTn = enum_auto()

ImplicitGemmModeNames = {
ImplicitGemmMode.GemmNt: 'gemm_nt',
ImplicitGemmMode.GemmTn: 'gemm_tn',
}

ImplicitGemmModeTag = {
ImplicitGemmMode.GemmNt: 'cutlass::conv::ImplicitGemmMode::GEMM_NT',
ImplicitGemmMode.GemmTn: 'cutlass::conv::ImplicitGemmMode::GEMM_TN',
}

###################################################################################################

#
class MathInstruction:
def __init__(self, instruction_shape, element_a, element_b, element_accumulator, opcode_class, math_operation = MathOperation.multiply_add):
self.instruction_shape = instruction_shape
self.element_a = element_a
self.element_b = element_b
self.element_accumulator = element_accumulator
self.opcode_class = opcode_class
self.math_operation = math_operation


#
class TileDescription:

def __init__(self, threadblock_shape, stages, warp_count, math_instruction, min_compute, max_compute):
self.threadblock_shape = threadblock_shape
self.stages = stages
self.warp_count = warp_count
self.math_instruction = math_instruction
self.minimum_compute_capability = min_compute
self.maximum_compute_capability = max_compute

def procedural_name(self):
return "%dx%d_%dx%d" % (self.threadblock_shape[0], self.threadblock_shape[1], self.threadblock_shape[2], self.stages)

#
class TensorDescription:
def __init__(self, element, layout, alignment = 1, complex_transform = ComplexTransform.none):
self.element = element
self.layout = layout
self.alignment = alignment
self.complex_transform = complex_transform

###################################################################################################

+ 578
- 0
dnn/scripts/cutlass_generator/list.bzl

@@ -0,0 +1,578 @@
# Generated by dnn/scripts/cutlass_generator/gen_list.py

cutlass_gen_list = [
"cutlass_simt_sgemm_8x32_8x2_nn_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nn_align1.cu",
"cutlass_simt_sgemm_16x32_8x2_nn_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nn_align1.cu",
"cutlass_simt_sgemm_16x64_8x2_nn_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nn_align1.cu",
"cutlass_simt_sgemm_32x32_8x2_nn_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nn_align1.cu",
"cutlass_simt_sgemm_32x64_8x2_nn_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nn_align1.cu",
"cutlass_simt_sgemm_64x32_8x2_nn_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nn_align1.cu",
"cutlass_simt_sgemm_16x128_8x2_nn_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nn_align1.cu",
"cutlass_simt_sgemm_32x128_8x2_nn_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nn_align1.cu",
"cutlass_simt_sgemm_64x64_8x2_nn_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nn_align1.cu",
"cutlass_simt_sgemm_128x32_8x2_nn_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nn_align1.cu",
"cutlass_simt_sgemm_64x128_8x2_nn_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nn_align1.cu",
"cutlass_simt_sgemm_128x64_8x2_nn_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nn_align1.cu",
"cutlass_simt_sgemm_32x256_8x2_nn_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nn_align1.cu",
"cutlass_simt_sgemm_64x256_8x2_nn_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nn_align1.cu",
"cutlass_simt_sgemm_128x128_8x2_nn_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nn_align1.cu",
"cutlass_simt_sgemm_256x32_8x2_nn_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nn_align1.cu",
"cutlass_simt_sgemm_256x64_8x2_nn_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nn_align1.cu",
"cutlass_simt_sgemm_8x32_8x2_nt_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nt_align1.cu",
"cutlass_simt_sgemm_16x32_8x2_nt_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nt_align1.cu",
"cutlass_simt_sgemm_16x64_8x2_nt_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nt_align1.cu",
"cutlass_simt_sgemm_32x32_8x2_nt_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nt_align1.cu",
"cutlass_simt_sgemm_32x64_8x2_nt_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nt_align1.cu",
"cutlass_simt_sgemm_64x32_8x2_nt_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nt_align1.cu",
"cutlass_simt_sgemm_16x128_8x2_nt_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nt_align1.cu",
"cutlass_simt_sgemm_32x128_8x2_nt_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nt_align1.cu",
"cutlass_simt_sgemm_64x64_8x2_nt_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nt_align1.cu",
"cutlass_simt_sgemm_128x32_8x2_nt_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nt_align1.cu",
"cutlass_simt_sgemm_64x128_8x2_nt_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nt_align1.cu",
"cutlass_simt_sgemm_128x64_8x2_nt_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nt_align1.cu",
"cutlass_simt_sgemm_32x256_8x2_nt_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nt_align1.cu",
"cutlass_simt_sgemm_64x256_8x2_nt_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nt_align1.cu",
"cutlass_simt_sgemm_128x128_8x2_nt_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nt_align1.cu",
"cutlass_simt_sgemm_256x32_8x2_nt_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nt_align1.cu",
"cutlass_simt_sgemm_256x64_8x2_nt_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nt_align1.cu",
"cutlass_simt_sgemm_8x32_8x2_tn_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tn_align1.cu",
"cutlass_simt_sgemm_16x32_8x2_tn_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tn_align1.cu",
"cutlass_simt_sgemm_16x64_8x2_tn_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tn_align1.cu",
"cutlass_simt_sgemm_32x32_8x2_tn_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tn_align1.cu",
"cutlass_simt_sgemm_32x64_8x2_tn_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tn_align1.cu",
"cutlass_simt_sgemm_64x32_8x2_tn_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tn_align1.cu",
"cutlass_simt_sgemm_16x128_8x2_tn_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tn_align1.cu",
"cutlass_simt_sgemm_32x128_8x2_tn_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tn_align1.cu",
"cutlass_simt_sgemm_64x64_8x2_tn_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tn_align1.cu",
"cutlass_simt_sgemm_128x32_8x2_tn_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tn_align1.cu",
"cutlass_simt_sgemm_64x128_8x2_tn_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tn_align1.cu",
"cutlass_simt_sgemm_128x64_8x2_tn_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tn_align1.cu",
"cutlass_simt_sgemm_32x256_8x2_tn_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tn_align1.cu",
"cutlass_simt_sgemm_64x256_8x2_tn_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tn_align1.cu",
"cutlass_simt_sgemm_128x128_8x2_tn_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tn_align1.cu",
"cutlass_simt_sgemm_256x32_8x2_tn_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tn_align1.cu",
"cutlass_simt_sgemm_256x64_8x2_tn_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tn_align1.cu",
"cutlass_simt_sgemm_8x32_8x2_tt_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tt_align1.cu",
"cutlass_simt_sgemm_16x32_8x2_tt_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tt_align1.cu",
"cutlass_simt_sgemm_16x64_8x2_tt_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tt_align1.cu",
"cutlass_simt_sgemm_32x32_8x2_tt_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tt_align1.cu",
"cutlass_simt_sgemm_32x64_8x2_tt_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tt_align1.cu",
"cutlass_simt_sgemm_64x32_8x2_tt_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tt_align1.cu",
"cutlass_simt_sgemm_16x128_8x2_tt_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tt_align1.cu",
"cutlass_simt_sgemm_32x128_8x2_tt_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tt_align1.cu",
"cutlass_simt_sgemm_64x64_8x2_tt_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tt_align1.cu",
"cutlass_simt_sgemm_128x32_8x2_tt_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tt_align1.cu",
"cutlass_simt_sgemm_64x128_8x2_tt_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tt_align1.cu",
"cutlass_simt_sgemm_128x64_8x2_tt_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tt_align1.cu",
"cutlass_simt_sgemm_32x256_8x2_tt_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tt_align1.cu",
"cutlass_simt_sgemm_64x256_8x2_tt_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tt_align1.cu",
"cutlass_simt_sgemm_128x128_8x2_tt_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tt_align1.cu",
"cutlass_simt_sgemm_256x32_8x2_tt_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tt_align1.cu",
"cutlass_simt_sgemm_256x64_8x2_tt_align1.cu",
"cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tt_align1.cu",
"cutlass_simt_sgemv_batched_strided_1x128_32_tt_align4x4.cu",
"cutlass_simt_sgemv_batched_strided_1x128_16_tt_align4x2.cu",
"cutlass_simt_sgemv_batched_strided_1x128_8_tt_align4x1.cu",
"cutlass_simt_sgemv_batched_strided_1x128_16_tt_align2x4.cu",
"cutlass_simt_sgemv_batched_strided_1x128_8_tt_align2x2.cu",
"cutlass_simt_sgemv_batched_strided_1x128_4_tt_align2x1.cu",
"cutlass_simt_sgemv_batched_strided_1x128_8_tt_align1x4.cu",
"cutlass_simt_sgemv_batched_strided_1x128_4_tt_align1x2.cu",
"cutlass_simt_sgemv_batched_strided_1x128_2_tt_align1x1.cu",
"cutlass_simt_sgemv_batched_strided_1x64_64_tt_align4x4.cu",
"cutlass_simt_sgemv_batched_strided_1x64_32_tt_align4x2.cu",
"cutlass_simt_sgemv_batched_strided_1x64_16_tt_align4x1.cu",
"cutlass_simt_sgemv_batched_strided_1x64_32_tt_align2x4.cu",
"cutlass_simt_sgemv_batched_strided_1x64_16_tt_align2x2.cu",
"cutlass_simt_sgemv_batched_strided_1x64_8_tt_align2x1.cu",
"cutlass_simt_sgemv_batched_strided_1x64_16_tt_align1x4.cu",
"cutlass_simt_sgemv_batched_strided_1x64_8_tt_align1x2.cu",
"cutlass_simt_sgemv_batched_strided_1x64_4_tt_align1x1.cu",
"cutlass_simt_sgemv_batched_strided_1x32_128_tt_align4x4.cu",
"cutlass_simt_sgemv_batched_strided_1x32_64_tt_align4x2.cu",
"cutlass_simt_sgemv_batched_strided_1x32_32_tt_align4x1.cu",
"cutlass_simt_sgemv_batched_strided_1x32_64_tt_align2x4.cu",
"cutlass_simt_sgemv_batched_strided_1x32_32_tt_align2x2.cu",
"cutlass_simt_sgemv_batched_strided_1x32_16_tt_align2x1.cu",
"cutlass_simt_sgemv_batched_strided_1x32_32_tt_align1x4.cu",
"cutlass_simt_sgemv_batched_strided_1x32_16_tt_align1x2.cu",
"cutlass_simt_sgemv_batched_strided_1x32_8_tt_align1x1.cu",
"cutlass_simt_s8_idgrad_identity_s8_64x128x32_64x32x32_2_nc4hw4_k4rsc4.cu",
"cutlass_simt_s8_idgrad_identity_s8_32x128x32_32x64x32_2_nc4hw4_k4rsc4.cu",
"cutlass_simt_s8_idgrad_identity_s8_16x128x16_16x64x16_2_nc4hw4_k4rsc4.cu",
"cutlass_simt_s8_idgrad_identity_s8_16x128x16_16x128x16_1_nc4hw4_k4rsc4.cu",
"cutlass_simt_s8_idgrad_identity_s8_16x64x8_16x64x8_2_nc4hw4_k4rsc4.cu",
"cutlass_simt_s8_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_1x1_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_1x1_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_1x1_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu",
"cutlass_simt_s8_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_s8_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
"cutlass_simt_u4_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_u4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_s4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu",
"cutlass_simt_f32_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_simt_f32_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu",
"cutlass_tensorop_s8_i8816fprop_identity_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_identity_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_identity_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_identity_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_identity_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_identity_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_relu_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_hswish_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_identity_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_relu_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_hswish_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu",
"cutlass_tensorop_s8_i8816fprop_identity_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_identity_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_identity_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_identity_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_identity_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_identity_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_relu_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_hswish_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_identity_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_relu_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_hswish_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
"cutlass_tensorop_s4_i8832fprop_identity_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu",
"cutlass_tensorop_s4_i8832fprop_relu_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu",
"cutlass_tensorop_s4_i8832fprop_hswish_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu",
"cutlass_tensorop_s4_i8832fprop_identity_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu",
"cutlass_tensorop_s4_i8832fprop_relu_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu",
"cutlass_tensorop_s4_i8832fprop_hswish_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu",
"cutlass_tensorop_u4_i8832fprop_identity_u4_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu",
"cutlass_tensorop_u4_i8832fprop_relu_u4_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu",
"cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu",
"cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu",
"cutlass_tensorop_s4_i8832fprop_identity_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8832fprop_identity_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_s4_i8832fprop_identity_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8832fprop_identity_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_s4_i8832fprop_identity_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_s4_i8832fprop_identity_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu",
"cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu",
"cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu",
"cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu",
]

+ 351
- 0
dnn/scripts/cutlass_generator/manifest.py View File

@@ -0,0 +1,351 @@
#
# \file manifest.py
#
# \brief Generates the CUTLASS Library's instances
#

import enum
import os.path
import re
import shutil

from library import *
from gemm_operation import *
from conv2d_operation import *

###################################################################################################

class EmitOperationKindLibrary:
  def __init__(self, generated_path, kind, args):
    self.generated_path = generated_path
    self.kind = kind
    self.args = args

    self.emitters = {
      OperationKind.Gemm: EmitGemmConfigurationLibrary
      , OperationKind.Conv2d: EmitConv2dConfigurationLibrary
    }

    self.configurations = []

    self.header_template = """
/*
  Generated by manifest.py - Do not edit.
*/

#include "cutlass/cutlass.h"
#include "cutlass/library/library.h"
#include "cutlass/library/manifest.h"

namespace cutlass {
namespace library {

///////////////////////////////////////////////////////////////////////////////////////////////////

"""
    self.entry_template = """

//
// Entry point to construct operations
//
void initialize_all_${operation_name}_operations(Manifest &manifest) {
"""
    self.configuration_prototype_template = "void initialize_${configuration_name}(Manifest &manifest);\n"
    self.configuration_template = "  initialize_${configuration_name}(manifest);\n"

    self.epilogue_template = """

}

///////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace library
} // namespace cutlass

"""

  #
  def __enter__(self):
    self.operation_path = os.path.join(self.generated_path, OperationKindNames[self.kind])
    os.mkdir(self.operation_path)

    self.top_level_path = os.path.join(self.operation_path, "all_%s_operations.cu" % OperationKindNames[self.kind])

    self.top_level_file = open(self.top_level_path, "w")
    self.top_level_file.write(self.header_template)

    self.source_files = [self.top_level_path,]

    return self

  #
  def emit(self, configuration_name, operations):
    with self.emitters[self.kind](self.operation_path, configuration_name) as configuration_emitter:
      for operation in operations:
        configuration_emitter.emit(operation)
      self.source_files.append(configuration_emitter.configuration_path)

    self.configurations.append(configuration_name)
    self.top_level_file.write(SubstituteTemplate(self.configuration_prototype_template, {'configuration_name': configuration_name}))

  #
  def __exit__(self, exception_type, exception_value, traceback):
    self.top_level_file.write(SubstituteTemplate(self.entry_template, {'operation_name': OperationKindNames[self.kind]}))

    for configuration_name in self.configurations:
      self.top_level_file.write(SubstituteTemplate(self.configuration_template, {'configuration_name': configuration_name}))

    self.top_level_file.write(self.epilogue_template)
    self.top_level_file.close()
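
# Usage sketch (illustrative, not part of the original file): EmitOperationKindLibrary
# is a context manager. __enter__ creates generated/<kind>/ and opens
# all_<kind>_operations.cu, emit() appends one configuration source file per call,
# and __exit__ writes the initialize_all_<kind>_operations() entry point.
#
#   with EmitOperationKindLibrary("generated", OperationKind.Conv2d, args) as emitter:
#     for configuration_name, operations in configurations.items():
#       emitter.emit(configuration_name, operations)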

###################################################################################################
###################################################################################################

class Options:
  def __init__(self):
    pass

###################################################################################################

#
class Manifest:

  #
  def __init__(self, args):
    self.operations = {}
    self.args = args

    architectures = args.architectures.split(';') if len(args.architectures) else ['50',]
    self.compute_capabilities = [int(x) for x in architectures]
    self.selected_kernels = []

    if args.operations == 'all':
      self.operations_enabled = []
    else:
      operations_list = [
        OperationKind.Gemm
        , OperationKind.Conv2d
      ]
      self.operations_enabled = [x for x in operations_list if OperationKindNames[x] in args.operations.split(',')]

    if args.kernels == 'all':
      self.kernel_names = []
    else:
      self.kernel_names = [x for x in args.kernels.split(',') if x != '']

    self.ignore_kernel_names = [x for x in args.ignore_kernels.split(',') if x != '']

    if args.kernel_filter_file is None:
      self.kernel_filter_list = []
    else:
      self.kernel_filter_list = self.get_kernel_filters(args.kernel_filter_file)

    self.operation_count = 0
    self.operations_by_name = {}

    self.top_level_prologue = '''

#include "cutlass/library/library.h"
#include "cutlass/library/manifest.h"

namespace cutlass {
namespace library {

${prototypes}

void initialize_all(Manifest &manifest) {

'''
    self.top_level_reserve = '  manifest.reserve(${operation_count});\n\n'
    self.top_level_epilogue = '''
}

} // namespace library
} // namespace cutlass

'''


  def get_kernel_filters(self, kernelListFile):
    if os.path.isfile(kernelListFile):
      with open(kernelListFile, 'r') as fileReader:
        lines = [line.rstrip() for line in fileReader if not line.startswith("#")]
        lines = [re.compile(line) for line in lines if line]
      return lines
    else:
      return []

  def filter_out_kernels(self, kernel_name, kernel_filter_list):
    for kernel_filter_re in kernel_filter_list:
      if kernel_filter_re.search(kernel_name) is not None:
        return True

    return False

  #
  def _filter_string_matches(self, filter_string, haystack):
    ''' Returns true if all substrings appear in the haystack in order'''
    substrings = filter_string.split('*')
    for sub in substrings:
      idx = haystack.find(sub)
      if idx < 0:
        return False
      haystack = haystack[idx + len(sub):]

    return True
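
  # Illustrative example (hypothetical pattern, not from the original file): the
  # filter string 'cutlass*s8*nc32hw32' accepts any name containing 'cutlass',
  # then 's8', then 'nc32hw32' in that left-to-right order, e.g.
  # cutlass_tensorop_s8_i8816fprop_relu_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32,
  # and rejects names where those substrings appear out of order.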

  #
  def filter(self, operation):
    ''' Filtering operations based on various criteria'''

    # filter based on compute capability
    enabled = False
    for cc in self.compute_capabilities:
      if cc >= operation.tile_description.minimum_compute_capability and \
         cc <= operation.tile_description.maximum_compute_capability:
        enabled = True
        break

    if not enabled:
      return False

    if len(self.operations_enabled) and operation.operation_kind not in self.operations_enabled:
      return False

    # eliminate duplicates
    if operation.procedural_name() in self.operations_by_name.keys():
      return False

    # Filter based on list of valid substrings
    if len(self.kernel_names):
      name = operation.procedural_name()
      enabled = False

      # compare against the include list
      for name_substr in self.kernel_names:
        if self._filter_string_matches(name_substr, name):
          enabled = True
          break

      # compare against the exclude list
      for name_substr in self.ignore_kernel_names:
        if self._filter_string_matches(name_substr, name):
          enabled = False
          break

    if len(self.kernel_filter_list) > 0:
      enabled = False
      if self.filter_out_kernels(operation.procedural_name(), self.kernel_filter_list):
        enabled = True

    # todo: filter based on compute data type
    return enabled
  #

  #
  def append(self, operation):
    '''
    Inserts the operation.

    operation_kind -> configuration_name -> []
    '''

    if self.filter(operation):
      self.selected_kernels.append(operation.procedural_name())

      self.operations_by_name[operation.procedural_name()] = operation

      # add the configuration
      configuration_name = operation.configuration_name()

      if operation.operation_kind not in self.operations.keys():
        self.operations[operation.operation_kind] = {}

      if configuration_name not in self.operations[operation.operation_kind].keys():
        self.operations[operation.operation_kind][configuration_name] = []

      self.operations[operation.operation_kind][configuration_name].append(operation)
      self.operation_count += 1
  #

  #
  def emit(self, target=GeneratorTarget.Library):

    operation_emitters = {
      GeneratorTarget.Library: EmitOperationKindLibrary
    }

    generated_path = os.path.join(self.args.curr_build_dir, 'generated')

    # create generated/
    if os.path.exists(generated_path):
      shutil.rmtree(generated_path)

    os.mkdir(generated_path)

    source_files = []

    top_level_path = os.path.join(generated_path, 'initialize_all.cpp')
    with open(top_level_path, 'w') as top_level_file:

      if target == GeneratorTarget.Library:
        source_files.append(top_level_path)

      prototypes = []
      for operation_kind, configurations in self.operations.items():
        prototypes.append(SubstituteTemplate(
          "void initialize_all_${operation_kind}_operations(Manifest &manifest);",
          {'operation_kind': OperationKindNames[operation_kind]}))

      top_level_file.write(SubstituteTemplate(self.top_level_prologue,
                                              {'prototypes': "\n".join(prototypes)}))

      top_level_file.write(SubstituteTemplate(
        self.top_level_reserve, {'operation_count': str(self.operation_count)}))

      # for each operation kind, emit initializer for all configurations
      for operation_kind, configurations in self.operations.items():
        with operation_emitters[target](generated_path, operation_kind, self.args) as operation_kind_emitter:
          for configuration_name, operations in configurations.items():
            operation_kind_emitter.emit(configuration_name, operations)

          source_files += operation_kind_emitter.source_files

        top_level_file.write(SubstituteTemplate(
          "  initialize_all_${operation_kind}_operations(manifest);\n",
          {'operation_kind': OperationKindNames[operation_kind]}))

      top_level_file.write(self.top_level_epilogue)

    # write the manifest.cmake file containing paths from all targets
    manifest_path = os.path.join(generated_path, "manifest.cmake")
    with open(manifest_path, "w") as manifest_file:

      target_name = 'cutlass_library_objs'

      target_text = SubstituteTemplate("""cutlass_target_sources(
  ${target_name}
  BATCH_SOURCES ON
  PRIVATE
""", {'target_name': target_name})

      manifest_file.write(target_text)

      for source_file in source_files:
        manifest_file.write("  %s\n" % str(source_file.replace('\\', '/')))
      manifest_file.write(")")
  #

###################################################################################################
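
For reference, a minimal sketch of how the Manifest above is typically driven (the argparse flags mirror the fields that __init__ reads; the exact flag spellings live in generator.py and may differ):

    import argparse

    # Build an args namespace with the fields Manifest.__init__ expects.
    parser = argparse.ArgumentParser()
    parser.add_argument("--operations", default="all")
    parser.add_argument("--kernels", default="all")
    parser.add_argument("--ignore-kernels", dest="ignore_kernels", default="")
    parser.add_argument("--kernel-filter-file", dest="kernel_filter_file", default=None)
    parser.add_argument("--architectures", default="75")
    parser.add_argument("--curr-build-dir", dest="curr_build_dir", default=".")
    args = parser.parse_args()

    manifest = Manifest(args)
    # ... manifest.append(op) for each operation built by the generators ...
    manifest.emit(GeneratorTarget.Library)  # writes generated/ and manifest.cmake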

+ 25
- 0
dnn/src/CMakeLists.txt View File

@@ -113,6 +113,31 @@ if(MGE_WITH_CUDA)
  list(APPEND SOURCES ${SOURCES_})

  file(GLOB_RECURSE CUSOURCES cuda/*.cu)

  set(CUTLASS_GEN_SCRIPT ${CMAKE_CURRENT_SOURCE_DIR}/../scripts/cutlass_generator/generator.py)
  set(CUTLASS_GEN_DIR ${CMAKE_CURRENT_BINARY_DIR}/cuda/cutlass/generated)

  function(gen_cutlass_kimpl op type)
    set(CURRENT_CUTLASS_GEN_DIR ${CUTLASS_GEN_DIR}/${op}_${type})
    file(MAKE_DIRECTORY ${CURRENT_CUTLASS_GEN_DIR})
    execute_process(
      COMMAND ${PYTHON3_EXECUTABLE_WITHOUT_VERSION} ${CUTLASS_GEN_SCRIPT} --operations ${op} --type ${type} ${CURRENT_CUTLASS_GEN_DIR}
      RESULT_VARIABLE gen_cutlass_result
      OUTPUT_FILE ${CURRENT_CUTLASS_GEN_DIR}/gen_cutlass.log
      ERROR_FILE ${CURRENT_CUTLASS_GEN_DIR}/gen_cutlass.log
    )
    if(NOT gen_cutlass_result EQUAL 0)
      message(FATAL_ERROR "Error generating library instances. See ${CURRENT_CUTLASS_GEN_DIR}/gen_cutlass.log")
    endif()
  endfunction()

  gen_cutlass_kimpl(gemm simt)
  gen_cutlass_kimpl(gemv simt)
  gen_cutlass_kimpl(deconv simt)
  gen_cutlass_kimpl(conv2d simt)
  gen_cutlass_kimpl(conv2d tensorop8816)
  gen_cutlass_kimpl(conv2d tensorop8832)

  file(GLOB_RECURSE CUTLASS_SOURCES ${CUTLASS_GEN_DIR}/*.cu)
  list(APPEND SOURCES ${CUTLASS_SOURCES})

  list(APPEND SOURCES ${CUSOURCES})
endif()
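
Each gen_cutlass_kimpl(<op> <type>) call above amounts to one manual run of the generator; a hedged sketch of the equivalent invocation (interpreter name and output directory are illustrative, the flags come from the COMMAND line above):

    import subprocess

    # Mirror execute_process(): generator.py takes --operations, --type, and the
    # output directory as a positional argument.
    subprocess.run(
        ["python3", "dnn/scripts/cutlass_generator/generator.py",
         "--operations", "conv2d", "--type", "tensorop8832",
         "build/cuda/cutlass/generated/conv2d_tensorop8832"],
        check=True,
    )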



+ 0
- 59
dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu View File

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc16hw16" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<16>,
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 32, 64>,
cutlass::gemm::GemmShape<64, 32, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
cutlass::int4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
16,
16,
false,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+ 0
- 59
dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu View File

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc32hw32" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<32>,
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 32, 64>,
cutlass::gemm::GemmShape<64, 32, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
cutlass::int4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
32,
32,
false,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+ 0
- 59
dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu View File

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc8hw8" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<8>,
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 32, 64>,
cutlass::gemm::GemmShape<64, 32, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
cutlass::int4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
8,
8,
false,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+ 0
- 59
dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu View File

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc16hw16" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<16>,
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 64, 64>,
cutlass::gemm::GemmShape<64, 64, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
cutlass::int4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
16,
16,
false,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+ 0
- 59
dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu View File

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc32hw32" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<32>,
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 64, 64>,
cutlass::gemm::GemmShape<64, 64, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
cutlass::int4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
32,
32,
false,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+ 0
- 59
dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu View File

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc8hw8" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<8>,
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 64, 64>,
cutlass::gemm::GemmShape<64, 64, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
cutlass::int4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
8,
8,
false,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+ 0
- 59
dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu View File

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc16hw16" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<16>,
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 32, 64>,
cutlass::gemm::GemmShape<64, 32, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
cutlass::int4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
16,
16,
false,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+ 0
- 59
dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu View File

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc32hw32" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<32>,
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 32, 64>,
cutlass::gemm::GemmShape<64, 32, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
cutlass::int4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
32,
32,
false,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+ 0
- 59
dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu View File

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc8hw8" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<8>,
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 32, 64>,
cutlass::gemm::GemmShape<64, 32, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
cutlass::int4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
8,
8,
false,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+ 0
- 59
dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu View File

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc16hw16" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<16>,
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 64, 64>,
cutlass::gemm::GemmShape<64, 64, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
cutlass::int4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
16,
16,
false,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+ 0
- 59
dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu View File

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc32hw32" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<32>,
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 64, 64>,
cutlass::gemm::GemmShape<64, 64, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
cutlass::int4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
32,
32,
false,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+ 0
- 59
dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu View File

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc8hw8" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<8>,
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 64, 64>,
cutlass::gemm::GemmShape<64, 64, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
cutlass::int4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
8,
8,
false,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+ 0
- 59
dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu View File

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc16hw16" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<16>,
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 32, 64>,
cutlass::gemm::GemmShape<64, 32, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
cutlass::int4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
16,
16,
false,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+ 0
- 59
dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu View File

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc32hw32" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<32>,
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 32, 64>,
cutlass::gemm::GemmShape<64, 32, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
cutlass::int4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
32,
32,
false,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+ 0
- 59
dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu View File

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc8hw8" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<8>,
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 32, 64>,
cutlass::gemm::GemmShape<64, 32, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
cutlass::int4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
8,
8,
false,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+ 0
- 59
dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu View File

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_2_nhwc_nc16hw16" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<16>,
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 64, 64>,
cutlass::gemm::GemmShape<64, 64, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
cutlass::int4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
16,
16,
false,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+ 0
- 59
dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu View File

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_2_nhwc_nc32hw32" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<32>,
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 64, 64>,
cutlass::gemm::GemmShape<64, 64, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
cutlass::int4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
32,
32,
false,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+ 0
- 59
dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu View File

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_2_nhwc_nc8hw8" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<8>,
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 64, 64>,
cutlass::gemm::GemmShape<64, 64, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
cutlass::int4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
8,
8,
false,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+ 0
- 59
dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu View File

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<64>,
cutlass::int4b_t,
cutlass::layout::TensorCxRSKx<64>,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<64>,
int32_t,
cutlass::layout::TensorNCxHWx<64>,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 128, 128>,
cutlass::gemm::GemmShape<64, 64, 128>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
cutlass::int4b_t,
16,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
32,
32,
true,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_NT>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+ 0
- 59
dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu View File

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_2_nhwc_nc16hw16" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<16>,
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 32, 64>,
cutlass::gemm::GemmShape<64, 32, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
cutlass::int4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
16,
16,
true,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+ 0
- 59
dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu View File

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_2_nhwc_nc32hw32" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<32>,
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 32, 64>,
cutlass::gemm::GemmShape<64, 32, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
cutlass::int4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
32,
32,
true,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+0 -59  dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_2_nhwc_nc8hw8" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<8>,
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 32, 64>,
cutlass::gemm::GemmShape<64, 32, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
cutlass::int4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
8,
8,
true,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+0 -59  dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_2_nhwc_nc16hw16" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<16>,
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 64, 64>,
cutlass::gemm::GemmShape<64, 64, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
cutlass::int4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
16,
16,
true,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+0 -59  dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_2_nhwc_nc32hw32" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<32>,
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 64, 64>,
cutlass::gemm::GemmShape<64, 64, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
cutlass::int4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
32,
32,
true,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+0 -59  dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_2_nhwc_nc8hw8" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<8>,
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 64, 64>,
cutlass::gemm::GemmShape<64, 64, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
cutlass::int4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
8,
8,
true,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+0 -59  dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_s4_i8832fprop_hswish_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<64>,
cutlass::int4b_t,
cutlass::layout::TensorCxRSKx<64>,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<64>,
int32_t,
cutlass::layout::TensorNCxHWx<64>,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<256, 128, 128>,
cutlass::gemm::GemmShape<64, 64, 128>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
cutlass::int4b_t,
16,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
32,
32,
true,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_NT>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif
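
Note: each instance name encodes the full template configuration: source/destination element type (s4/u4), the tensor-op mma shape (i8832 = 8x8x32), the epilogue activation (hswish/relu/identity), threadblock and warp tile shapes, the stage count, and the src/filter layout pair (nc64hw64_c64rsk64 in the file above, nhwc_nc16hw16 and friends in the earlier ones; the interleaved pair also switches the threadblock swizzle and the implicit-gemm mode from GEMM_TN to GEMM_NT). A hypothetical parser, inferred from the names in this diff only:

import re

NAME_RE = re.compile(
    r"cutlass_tensorop_(?P<src>[su]4)_i8832fprop"  # src element; 8x8x32 mma
    r"(?:_1x1)?"                                   # optional 1x1 variant
    r"_(?P<act>hswish|relu|identity)"              # epilogue activation
    r"_(?P<dst>[su]4)(?:_s4)?"                     # dst (and, for u4, filter) element
    r"_(?P<tb>\d+x\d+x\d+)_(?P<warp>\d+x\d+x\d+)"  # threadblock / warp tiles
    r"_(?P<stages>\d+)_(?P<layouts>\w+)"           # pipeline stages; src/filter layouts
)

def parse_kimpl_name(name):
    m = NAME_RE.match(name)
    if m is None:
        raise ValueError(f"unrecognized kernel instance name: {name}")
    return {
        "src_element": m.group("src"),
        "activation": m.group("act"),
        "threadblock_shape": tuple(map(int, m.group("tb").split("x"))),
        "warp_shape": tuple(map(int, m.group("warp").split("x"))),
        "stages": int(m.group("stages")),
        "layouts": m.group("layouts"),  # e.g. "nhwc_nc16hw16" or "nc64hw64_c64rsk64"
    }

print(parse_kimpl_name(
    "cutlass_tensorop_s4_i8832fprop_hswish_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64"))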

+0 -59  dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_s4_i8832fprop_identity_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<64>,
cutlass::int4b_t,
cutlass::layout::TensorCxRSKx<64>,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<64>,
int32_t,
cutlass::layout::TensorNCxHWx<64>,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 128, 128>,
cutlass::gemm::GemmShape<64, 64, 128>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
cutlass::int4b_t,
16,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
32,
32,
true,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_NT>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+0 -59  dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_s4_i8832fprop_identity_s4_128x32x64_64x32x64_2_nhwc_nc16hw16" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<16>,
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 32, 64>,
cutlass::gemm::GemmShape<64, 32, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
cutlass::int4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
16,
16,
true,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+0 -59  dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_s4_i8832fprop_identity_s4_128x32x64_64x32x64_2_nhwc_nc32hw32" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<32>,
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 32, 64>,
cutlass::gemm::GemmShape<64, 32, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
cutlass::int4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
32,
32,
true,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+0 -59  dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_s4_i8832fprop_identity_s4_128x32x64_64x32x64_2_nhwc_nc8hw8" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<8>,
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 32, 64>,
cutlass::gemm::GemmShape<64, 32, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
cutlass::int4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
8,
8,
true,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+0 -59  dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_s4_i8832fprop_identity_s4_128x64x64_64x64x64_2_nhwc_nc16hw16" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<16>,
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 64, 64>,
cutlass::gemm::GemmShape<64, 64, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
cutlass::int4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
16,
16,
true,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+0 -59  dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_s4_i8832fprop_identity_s4_128x64x64_64x64x64_2_nhwc_nc32hw32" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<32>,
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 64, 64>,
cutlass::gemm::GemmShape<64, 64, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
cutlass::int4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
32,
32,
true,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+0 -59  dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_s4_i8832fprop_identity_s4_128x64x64_64x64x64_2_nhwc_nc8hw8" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<8>,
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 64, 64>,
cutlass::gemm::GemmShape<64, 64, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
cutlass::int4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
8,
8,
true,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+0 -59  dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_s4_i8832fprop_identity_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<64>,
cutlass::int4b_t,
cutlass::layout::TensorCxRSKx<64>,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<64>,
int32_t,
cutlass::layout::TensorNCxHWx<64>,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<256, 128, 128>,
cutlass::gemm::GemmShape<64, 64, 128>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
cutlass::int4b_t,
16,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
32,
32,
true,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_NT>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+0 -59  dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_s4_i8832fprop_relu_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<64>,
cutlass::int4b_t,
cutlass::layout::TensorCxRSKx<64>,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<64>,
int32_t,
cutlass::layout::TensorNCxHWx<64>,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 128, 128>,
cutlass::gemm::GemmShape<64, 64, 128>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
cutlass::int4b_t,
16,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
32,
32,
true,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_NT>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+0 -59  dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc16hw16" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<16>,
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 32, 64>,
cutlass::gemm::GemmShape<64, 32, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
cutlass::int4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
16,
16,
true,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+0 -59  dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc32hw32" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<32>,
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 32, 64>,
cutlass::gemm::GemmShape<64, 32, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
cutlass::int4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
32,
32,
true,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+0 -59  dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc8hw8" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<8>,
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 32, 64>,
cutlass::gemm::GemmShape<64, 32, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
cutlass::int4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
8,
8,
true,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+0 -59  dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_2_nhwc_nc16hw16" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<16>,
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 64, 64>,
cutlass::gemm::GemmShape<64, 64, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
cutlass::int4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
16,
16,
true,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+0 -59  dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_2_nhwc_nc32hw32" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<32>,
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 64, 64>,
cutlass::gemm::GemmShape<64, 64, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
cutlass::int4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
32,
32,
true,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+0 -59  dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_2_nhwc_nc8hw8" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<8>,
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 64, 64>,
cutlass::gemm::GemmShape<64, 64, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
cutlass::int4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
8,
8,
true,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+0 -59  dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_s4_i8832fprop_relu_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<64>,
cutlass::int4b_t,
cutlass::layout::TensorCxRSKx<64>,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<64>,
int32_t,
cutlass::layout::TensorNCxHWx<64>,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<256, 128, 128>,
cutlass::gemm::GemmShape<64, 64, 128>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
cutlass::int4b_t,
16,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
32,
32,
true,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_NT>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+0 -59  dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<16>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 32, 64>,
cutlass::gemm::GemmShape<64, 32, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
16,
16,
false,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif
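
Note: compared with the s4 kernels earlier in this diff, the u4 "1x1" instances above differ in three places: src/dst elements become cutlass::uint4b_t while the filter stays cutlass::int4b_t, the epilogue's element type follows the dst type, and the boolean ahead of OpMultiplyAddSaturate flips from true to false. A hypothetical summary of those knobs (field names are illustrative; the meaning of the flag is inferred from the diff, not stated in it):

def u4_1x1_variant(interleave, activation):
    # src/dst become unsigned int4; the filter remains signed int4.
    return {
        "element_src": "cutlass::uint4b_t",
        "element_filter": "cutlass::int4b_t",
        "element_dst": "cutlass::uint4b_t",
        "layout_filter": f"cutlass::layout::TensorNCxHWx<{interleave}>",
        "epilogue_functor": {
            "identity": "cutlass::epilogue::thread::BiasAddLinearCombinationClamp",
            "relu": "cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp",
        }[activation],
        # true for the generic s4 kernels, false for these 1x1 kernels; this
        # reads like a "load filter offsets from constant memory" switch that
        # a 1x1 convolution does not need, but that is an inference.
        "const_mem": "false",
    }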

+0 -59  dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<32>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 32, 64>,
cutlass::gemm::GemmShape<64, 32, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
32,
32,
false,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+0 -59  dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<8>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 32, 64>,
cutlass::gemm::GemmShape<64, 32, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
8,
8,
false,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+0 -59  dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<16>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 64, 64>,
cutlass::gemm::GemmShape<64, 64, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
16,
16,
false,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+0 -59  dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<32>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 64, 64>,
cutlass::gemm::GemmShape<64, 64, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
32,
32,
false,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+0 -59  dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<8>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 64, 64>,
cutlass::gemm::GemmShape<64, 64, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
8,
8,
false,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+0 -59  dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<16>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 32, 64>,
cutlass::gemm::GemmShape<64, 32, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
16,
16,
false,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+0 -59  dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<32>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 32, 64>,
cutlass::gemm::GemmShape<64, 32, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
32,
32,
false,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+0 -59  dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<8>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 32, 64>,
cutlass::gemm::GemmShape<64, 32, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
8,
8,
false,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+0 -59  dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<16>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 64, 64>,
cutlass::gemm::GemmShape<64, 64, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
16,
16,
false,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+ 0
- 59
dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<32>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 64, 64>,
cutlass::gemm::GemmShape<64, 64, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
32,
32,
false,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+ 0
- 59
dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<8>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 64, 64>,
cutlass::gemm::GemmShape<64, 64, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
8,
8,
false,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+ 0
- 59
dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::uint4b_t,
cutlass::layout::TensorNCxHWx<64>,
cutlass::int4b_t,
cutlass::layout::TensorCxRSKx<64>,
cutlass::uint4b_t,
cutlass::layout::TensorNCxHWx<64>,
int32_t,
cutlass::layout::TensorNCxHWx<64>,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 128, 128>,
cutlass::gemm::GemmShape<64, 64, 128>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
cutlass::uint4b_t,
16,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
32,
32,
true,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_NT>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+ 0
- 59
dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<16>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 32, 64>,
cutlass::gemm::GemmShape<64, 32, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
16,
16,
true,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+ 0
- 59
dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<32>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 32, 64>,
cutlass::gemm::GemmShape<64, 32, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
32,
32,
true,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+ 0
- 59
dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<8>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 32, 64>,
cutlass::gemm::GemmShape<64, 32, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
8,
8,
true,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+ 0
- 59
dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<16>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 64, 64>,
cutlass::gemm::GemmShape<64, 64, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
16,
16,
true,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+ 0
- 59
dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<32>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 64, 64>,
cutlass::gemm::GemmShape<64, 64, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
32,
32,
true,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+ 0
- 59
dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<8>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 64, 64>,
cutlass::gemm::GemmShape<64, 64, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
8,
8,
true,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+ 0
- 59
dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::uint4b_t,
cutlass::layout::TensorNCxHWx<64>,
cutlass::int4b_t,
cutlass::layout::TensorCxRSKx<64>,
cutlass::uint4b_t,
cutlass::layout::TensorNCxHWx<64>,
int32_t,
cutlass::layout::TensorNCxHWx<64>,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<256, 128, 128>,
cutlass::gemm::GemmShape<64, 64, 128>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
cutlass::uint4b_t,
16,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
32,
32,
true,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_NT>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+ 0
- 59
dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::uint4b_t,
cutlass::layout::TensorNCxHWx<64>,
cutlass::int4b_t,
cutlass::layout::TensorCxRSKx<64>,
cutlass::uint4b_t,
cutlass::layout::TensorNCxHWx<64>,
int32_t,
cutlass::layout::TensorNCxHWx<64>,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 128, 128>,
cutlass::gemm::GemmShape<64, 64, 128>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
cutlass::uint4b_t,
16,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
32,
32,
true,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_NT>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+ 0
- 59
dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<16>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 32, 64>,
cutlass::gemm::GemmShape<64, 32, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
16,
16,
true,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+ 0
- 59
dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<32>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 32, 64>,
cutlass::gemm::GemmShape<64, 32, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
32,
32,
true,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+ 0
- 59
dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<8>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 32, 64>,
cutlass::gemm::GemmShape<64, 32, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
8,
8,
true,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+ 0
- 59
dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<16>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 64, 64>,
cutlass::gemm::GemmShape<64, 64, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
16,
16,
true,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+ 0
- 59
dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<32>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 64, 64>,
cutlass::gemm::GemmShape<64, 64, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
32,
32,
true,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+ 0
- 59
dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<8>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 64, 64>,
cutlass::gemm::GemmShape<64, 64, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
8,
8,
true,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+ 0
- 59
dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::uint4b_t,
cutlass::layout::TensorNCxHWx<64>,
cutlass::int4b_t,
cutlass::layout::TensorCxRSKx<64>,
cutlass::uint4b_t,
cutlass::layout::TensorNCxHWx<64>,
int32_t,
cutlass::layout::TensorNCxHWx<64>,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<256, 128, 128>,
cutlass::gemm::GemmShape<64, 64, 128>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
cutlass::uint4b_t,
16,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
32,
32,
true,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_NT>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+ 0
- 59
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
int8_t,
cutlass::layout::TensorNCxHWx<4>,
int8_t,
cutlass::layout::TensorCxRSKx<4>,
float,
cutlass::layout::TensorNCHW,
float,
cutlass::layout::TensorNCHW,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassSimt,
cutlass::arch::Sm61,
cutlass::gemm::GemmShape<128, 128, 32>,
cutlass::gemm::GemmShape<64, 32, 32>,
cutlass::gemm::GemmShape<1, 1, 4>,
cutlass::epilogue::thread::BiasAddLinearCombinationHSwish<
float,
1,
int32_t,
float,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
4,
16,
false,
cutlass::arch::OpMultiplyAdd,
cutlass::conv::ImplicitGemmMode::GEMM_NT>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+ 0
- 59
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
int8_t,
cutlass::layout::TensorNCxHWx<4>,
int8_t,
cutlass::layout::TensorCxRSKx<4>,
float,
cutlass::layout::TensorNCHW,
float,
cutlass::layout::TensorNCHW,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassSimt,
cutlass::arch::Sm61,
cutlass::gemm::GemmShape<128, 32, 32>,
cutlass::gemm::GemmShape<64, 32, 32>,
cutlass::gemm::GemmShape<1, 1, 4>,
cutlass::epilogue::thread::BiasAddLinearCombinationHSwish<
float,
1,
int32_t,
float,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
4,
16,
false,
cutlass::arch::OpMultiplyAdd,
cutlass::conv::ImplicitGemmMode::GEMM_NT>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+ 0
- 59
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
int8_t,
cutlass::layout::TensorNCxHWx<4>,
int8_t,
cutlass::layout::TensorCxRSKx<4>,
float,
cutlass::layout::TensorNCHW,
float,
cutlass::layout::TensorNCHW,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassSimt,
cutlass::arch::Sm61,
cutlass::gemm::GemmShape<128, 64, 32>,
cutlass::gemm::GemmShape<64, 32, 32>,
cutlass::gemm::GemmShape<1, 1, 4>,
cutlass::epilogue::thread::BiasAddLinearCombinationHSwish<
float,
1,
int32_t,
float,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
4,
16,
false,
cutlass::arch::OpMultiplyAdd,
cutlass::conv::ImplicitGemmMode::GEMM_NT>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+ 0
- 59
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
int8_t,
cutlass::layout::TensorNCxHWx<4>,
int8_t,
cutlass::layout::TensorCxRSKx<4>,
float,
cutlass::layout::TensorNCHW,
float,
cutlass::layout::TensorNCHW,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassSimt,
cutlass::arch::Sm61,
cutlass::gemm::GemmShape<16, 128, 16>,
cutlass::gemm::GemmShape<16, 128, 16>,
cutlass::gemm::GemmShape<1, 1, 4>,
cutlass::epilogue::thread::BiasAddLinearCombinationHSwish<
float,
1,
int32_t,
float,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
1,
4,
8,
false,
cutlass::arch::OpMultiplyAdd,
cutlass::conv::ImplicitGemmMode::GEMM_NT>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+ 0
- 59
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
int8_t,
cutlass::layout::TensorNCxHWx<4>,
int8_t,
cutlass::layout::TensorCxRSKx<4>,
float,
cutlass::layout::TensorNCHW,
float,
cutlass::layout::TensorNCHW,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassSimt,
cutlass::arch::Sm61,
cutlass::gemm::GemmShape<16, 64, 8>,
cutlass::gemm::GemmShape<16, 64, 8>,
cutlass::gemm::GemmShape<1, 1, 4>,
cutlass::epilogue::thread::BiasAddLinearCombinationHSwish<
float,
1,
int32_t,
float,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
4,
4,
false,
cutlass::arch::OpMultiplyAdd,
cutlass::conv::ImplicitGemmMode::GEMM_NT>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+ 0
- 59
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
int8_t,
cutlass::layout::TensorNCxHWx<4>,
int8_t,
cutlass::layout::TensorCxRSKx<4>,
float,
cutlass::layout::TensorNCHW,
float,
cutlass::layout::TensorNCHW,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassSimt,
cutlass::arch::Sm61,
cutlass::gemm::GemmShape<32, 128, 32>,
cutlass::gemm::GemmShape<32, 64, 32>,
cutlass::gemm::GemmShape<1, 1, 4>,
cutlass::epilogue::thread::BiasAddLinearCombinationHSwish<
float,
1,
int32_t,
float,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
4,
16,
false,
cutlass::arch::OpMultiplyAdd,
cutlass::conv::ImplicitGemmMode::GEMM_NT>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+ 0
- 59
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
int8_t,
cutlass::layout::TensorNCxHWx<4>,
int8_t,
cutlass::layout::TensorCxRSKx<4>,
float,
cutlass::layout::TensorNCHW,
float,
cutlass::layout::TensorNCHW,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassSimt,
cutlass::arch::Sm61,
cutlass::gemm::GemmShape<32, 32, 32>,
cutlass::gemm::GemmShape<32, 32, 32>,
cutlass::gemm::GemmShape<1, 1, 4>,
cutlass::epilogue::thread::BiasAddLinearCombinationHSwish<
float,
1,
int32_t,
float,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
4,
16,
false,
cutlass::arch::OpMultiplyAdd,
cutlass::conv::ImplicitGemmMode::GEMM_NT>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+ 0
- 59
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
int8_t,
cutlass::layout::TensorNCxHWx<4>,
int8_t,
cutlass::layout::TensorCxRSKx<4>,
float,
cutlass::layout::TensorNCHW,
float,
cutlass::layout::TensorNCHW,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassSimt,
cutlass::arch::Sm61,
cutlass::gemm::GemmShape<32, 64, 32>,
cutlass::gemm::GemmShape<32, 64, 32>,
cutlass::gemm::GemmShape<1, 1, 4>,
cutlass::epilogue::thread::BiasAddLinearCombinationHSwish<
float,
1,
int32_t,
float,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
4,
16,
false,
cutlass::arch::OpMultiplyAdd,
cutlass::conv::ImplicitGemmMode::GEMM_NT>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+ 0
- 59
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
int8_t,
cutlass::layout::TensorNCxHWx<4>,
int8_t,
cutlass::layout::TensorCxRSKx<4>,
float,
cutlass::layout::TensorNCHW,
float,
cutlass::layout::TensorNCHW,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassSimt,
cutlass::arch::Sm61,
cutlass::gemm::GemmShape<64, 128, 32>,
cutlass::gemm::GemmShape<64, 32, 32>,
cutlass::gemm::GemmShape<1, 1, 4>,
cutlass::epilogue::thread::BiasAddLinearCombinationHSwish<
float,
1,
int32_t,
float,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
4,
16,
false,
cutlass::arch::OpMultiplyAdd,
cutlass::conv::ImplicitGemmMode::GEMM_NT>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+ 0
- 59
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
int8_t,
cutlass::layout::TensorNCxHWx<4>,
int8_t,
cutlass::layout::TensorCxRSKx<4>,
float,
cutlass::layout::TensorNCHW,
float,
cutlass::layout::TensorNCHW,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassSimt,
cutlass::arch::Sm61,
cutlass::gemm::GemmShape<64, 32, 32>,
cutlass::gemm::GemmShape<64, 32, 32>,
cutlass::gemm::GemmShape<1, 1, 4>,
cutlass::epilogue::thread::BiasAddLinearCombinationHSwish<
float,
1,
int32_t,
float,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
4,
16,
false,
cutlass::arch::OpMultiplyAdd,
cutlass::conv::ImplicitGemmMode::GEMM_NT>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+ 0
- 59
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
int8_t,
cutlass::layout::TensorNCxHWx<4>,
int8_t,
cutlass::layout::TensorCxRSKx<4>,
float,
cutlass::layout::TensorNCHW,
float,
cutlass::layout::TensorNCHW,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassSimt,
cutlass::arch::Sm61,
cutlass::gemm::GemmShape<64, 64, 32>,
cutlass::gemm::GemmShape<64, 32, 32>,
cutlass::gemm::GemmShape<1, 1, 4>,
cutlass::epilogue::thread::BiasAddLinearCombinationHSwish<
float,
1,
int32_t,
float,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
4,
16,
false,
cutlass::arch::OpMultiplyAdd,
cutlass::conv::ImplicitGemmMode::GEMM_NT>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+ 0
- 59
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_simt_f32_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
int8_t,
cutlass::layout::TensorNCxHWx<4>,
int8_t,
cutlass::layout::TensorCxRSKx<4>,
float,
cutlass::layout::TensorNCHW,
float,
cutlass::layout::TensorNCHW,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassSimt,
cutlass::arch::Sm61,
cutlass::gemm::GemmShape<128, 128, 32>,
cutlass::gemm::GemmShape<64, 32, 32>,
cutlass::gemm::GemmShape<1, 1, 4>,
cutlass::epilogue::thread::BiasAddLinearCombination<
float,
1,
int32_t,
float,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
4,
16,
false,
cutlass::arch::OpMultiplyAdd,
cutlass::conv::ImplicitGemmMode::GEMM_NT>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+ 0
- 59
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_simt_f32_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
int8_t,
cutlass::layout::TensorNCxHWx<4>,
int8_t,
cutlass::layout::TensorCxRSKx<4>,
float,
cutlass::layout::TensorNCHW,
float,
cutlass::layout::TensorNCHW,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassSimt,
cutlass::arch::Sm61,
cutlass::gemm::GemmShape<128, 32, 32>,
cutlass::gemm::GemmShape<64, 32, 32>,
cutlass::gemm::GemmShape<1, 1, 4>,
cutlass::epilogue::thread::BiasAddLinearCombination<
float,
1,
int32_t,
float,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
4,
16,
false,
cutlass::arch::OpMultiplyAdd,
cutlass::conv::ImplicitGemmMode::GEMM_NT>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+0 -59  dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warnings from cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_simt_f32_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by the cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
int8_t,
cutlass::layout::TensorNCxHWx<4>,
int8_t,
cutlass::layout::TensorCxRSKx<4>,
float,
cutlass::layout::TensorNCHW,
float,
cutlass::layout::TensorNCHW,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassSimt,
cutlass::arch::Sm61,
cutlass::gemm::GemmShape<128, 64, 32>,
cutlass::gemm::GemmShape<64, 32, 32>,
cutlass::gemm::GemmShape<1, 1, 4>,
cutlass::epilogue::thread::BiasAddLinearCombination<
float,
1,
int32_t,
float,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
4,
16,
false,
cutlass::arch::OpMultiplyAdd,
cutlass::conv::ImplicitGemmMode::GEMM_NT>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+0 -59  dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warnings from cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_simt_f32_ifprop_1x1_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw" generated by the cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
int8_t,
cutlass::layout::TensorNCxHWx<4>,
int8_t,
cutlass::layout::TensorCxRSKx<4>,
float,
cutlass::layout::TensorNCHW,
float,
cutlass::layout::TensorNCHW,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassSimt,
cutlass::arch::Sm61,
cutlass::gemm::GemmShape<16, 128, 16>,
cutlass::gemm::GemmShape<16, 128, 16>,
cutlass::gemm::GemmShape<1, 1, 4>,
cutlass::epilogue::thread::BiasAddLinearCombination<
float,
1,
int32_t,
float,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
1,
4,
8,
false,
cutlass::arch::OpMultiplyAdd,
cutlass::conv::ImplicitGemmMode::GEMM_NT>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+0 -59  dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warnings from cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_simt_f32_ifprop_1x1_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw" generated by the cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
int8_t,
cutlass::layout::TensorNCxHWx<4>,
int8_t,
cutlass::layout::TensorCxRSKx<4>,
float,
cutlass::layout::TensorNCHW,
float,
cutlass::layout::TensorNCHW,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassSimt,
cutlass::arch::Sm61,
cutlass::gemm::GemmShape<16, 64, 8>,
cutlass::gemm::GemmShape<16, 64, 8>,
cutlass::gemm::GemmShape<1, 1, 4>,
cutlass::epilogue::thread::BiasAddLinearCombination<
float,
1,
int32_t,
float,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
4,
4,
false,
cutlass::arch::OpMultiplyAdd,
cutlass::conv::ImplicitGemmMode::GEMM_NT>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+0 -59  dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warnings from cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_simt_f32_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw" generated by the cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
int8_t,
cutlass::layout::TensorNCxHWx<4>,
int8_t,
cutlass::layout::TensorCxRSKx<4>,
float,
cutlass::layout::TensorNCHW,
float,
cutlass::layout::TensorNCHW,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassSimt,
cutlass::arch::Sm61,
cutlass::gemm::GemmShape<32, 128, 32>,
cutlass::gemm::GemmShape<32, 64, 32>,
cutlass::gemm::GemmShape<1, 1, 4>,
cutlass::epilogue::thread::BiasAddLinearCombination<
float,
1,
int32_t,
float,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
4,
16,
false,
cutlass::arch::OpMultiplyAdd,
cutlass::conv::ImplicitGemmMode::GEMM_NT>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif

+0 -59  dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu

@@ -1,59 +0,0 @@

#if !MEGDNN_TEGRA_X1
// ignore warnings from cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_simt_f32_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw" generated by the cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
int8_t,
cutlass::layout::TensorNCxHWx<4>,
int8_t,
cutlass::layout::TensorCxRSKx<4>,
float,
cutlass::layout::TensorNCHW,
float,
cutlass::layout::TensorNCHW,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassSimt,
cutlass::arch::Sm61,
cutlass::gemm::GemmShape<32, 32, 32>,
cutlass::gemm::GemmShape<32, 32, 32>,
cutlass::gemm::GemmShape<1, 1, 4>,
cutlass::epilogue::thread::BiasAddLinearCombination<
float,
1,
int32_t,
float,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
4,
16,
false,
cutlass::arch::OpMultiplyAdd,
cutlass::conv::ImplicitGemmMode::GEMM_NT>;



template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);


#pragma GCC diagnostic pop
#endif
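
For orientation, here is a minimal host-side sketch of how one of the instances above is dispatched: a helper forwards device pointers and MegDNN-produced parameters through the explicitly instantiated cutlass_convolution_wrapper. Only the Convolution template arguments and the wrapper signature are taken from the listings; the helper name launch_ifprop_32x32x32, the nullptr residual input, and the assumption that this compiles as a .cu file inside the MegDNN source tree are editorial.

// Host-side dispatch sketch (editorial example, not part of this diff).
#include <cuda_runtime.h>

#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"

// Mirrors the 32x32x32 SIMT instance deleted above.
using Convolution = typename cutlass::conv::device::Convolution<
        int8_t, cutlass::layout::TensorNCxHWx<4>,
        int8_t, cutlass::layout::TensorCxRSKx<4>,
        float, cutlass::layout::TensorNCHW,
        float, cutlass::layout::TensorNCHW,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        cutlass::gemm::GemmShape<32, 32, 32>,
        cutlass::gemm::GemmShape<32, 32, 32>,
        cutlass::gemm::GemmShape<1, 1, 4>,
        cutlass::epilogue::thread::BiasAddLinearCombination<
                float, 1, int32_t, float, float>,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2, 4, 16, false,
        cutlass::arch::OpMultiplyAdd,
        cutlass::conv::ImplicitGemmMode::GEMM_NT>;

// conv_param, epilogue and extra_param come from MegDNN's algo layer and are
// taken as given.  The identity epilogue has no residual input, so d_z is
// passed as nullptr (assumption: the wrapper tolerates a null z pointer).
void launch_ifprop_32x32x32(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        typename Convolution::ElementDst* d_dst, int* d_workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        typename Convolution::ExtraParam extra_param, cudaStream_t stream) {
    megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
            d_src, d_filter, d_bias, /*d_z=*/nullptr, d_dst, d_workspace,
            conv_param, epilogue, stream, extra_param);
}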

Some files were not shown because too many files changed in this diff
