You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

fp16_help.cuh 2.0 kB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768
  1. /**
  2. * \file dnn/src/cuda/fp16_help.cuh
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. */
  11. #pragma once
  12. #include <cuda_runtime_api.h>
  13. #include "cuda.h"
  14. #include "cuda_fp16.h"
  15. namespace megdnn {
  16. namespace cuda {
  /*!
   * \brief Scalar single-precision multiply-add.
   *
   * Evaluates the expression a * b + c on float operands. It is written as an
   * ordinary multiply followed by an add; no explicit fused intrinsic is used.
   *
   * \param a first multiplicand
   * \param b second multiplicand
   * \param c addend
   * \return a * b + c
   */
  __device__ __forceinline__ float fma(const float a, const float b, const float c) {
      const float result = a * b + c;
      return result;
  }
  /*!
   * \brief Two-lane single-precision multiply-add.
   *
   * Applies a * b + c independently to the x components and to the
   * y components of the three float2 operands.
   *
   * \param a first multiplicand pair
   * \param b second multiplicand pair
   * \param c addend pair
   * \return float2 holding {a.x * b.x + c.x, a.y * b.y + c.y}
   */
  __device__ __forceinline__ float2 fma2(const float2 a, const float2 b, const float2 c) {
      float2 result;
      result.x = a.x * b.x + c.x;
      result.y = a.y * b.y + c.y;
      return result;
  }
  23. #if CUDA_VERSION >= 9000
  /*!
   * \brief Scalar half-precision multiply-add.
   *
   * When compiling for __CUDA_ARCH__ >= 530 the __hfma intrinsic is used.
   * Otherwise the operands are widened to float, combined as a * b + c in
   * float, and the result is converted back to __half.
   *
   * \param a first multiplicand
   * \param b second multiplicand
   * \param c addend
   * \return a * b + c as a __half
   */
  __device__ __forceinline__ __half fma(const __half a, const __half b, const __half c) {
  #if __CUDA_ARCH__ < 530
      // Fallback path: do the arithmetic in float and narrow once at the end.
      const float fa = __half2float(a);
      const float fb = __half2float(b);
      const float fc = __half2float(c);
      return __float2half(fa * fb + fc);
  #else
      return __hfma(a, b, c);
  #endif
  }
  /*!
   * \brief Two-lane half-precision multiply-add.
   *
   * When compiling for __CUDA_ARCH__ >= 530 the __hfma2 intrinsic is used.
   * Otherwise each lane is widened to float, combined as a * b + c in float,
   * and the two results are converted back and packed into a __half2.
   *
   * \param a first multiplicand pair
   * \param b second multiplicand pair
   * \param c addend pair
   * \return __half2 whose lanes hold a * b + c for the matching input lanes
   */
  __device__ __forceinline__ __half2
  fma2(const __half2 a, const __half2 b, const __half2 c) {
  #if __CUDA_ARCH__ < 530
      // Fallback path: per-lane float arithmetic, then round-to-nearest
      // conversion of both lanes back to half.
      const float lo = __low2float(a) * __low2float(b) + __low2float(c);
      const float hi = __high2float(a) * __high2float(b) + __high2float(c);
      return __floats2half2_rn(lo, hi);
  #else
      return __hfma2(a, b, c);
  #endif
  }
  /*!
   * \brief Two-lane half-precision addition.
   *
   * When compiling for __CUDA_ARCH__ >= 530 the __hadd2 intrinsic is used.
   * Otherwise each lane is widened to float, added in float, and the two
   * sums are converted back and packed into a __half2.
   *
   * \param a first addend pair
   * \param b second addend pair
   * \return __half2 whose lanes hold a + b for the matching input lanes
   */
  __device__ __forceinline__ __half2 hadd2(const __half2 a, const __half2 b) {
  #if __CUDA_ARCH__ < 530
      // Fallback path: per-lane float addition, then round-to-nearest
      // conversion of both lanes back to half.
      const float lo = __low2float(a) + __low2float(b);
      const float hi = __high2float(a) + __high2float(b);
      return __floats2half2_rn(lo, hi);
  #else
      return __hadd2(a, b);
  #endif
  }
  /*!
   * \brief Two-lane multiply-add with half inputs and a float accumulator.
   *
   * Both __half2 multiplicands are widened to float, multiplied lane by lane
   * and added to the matching component of the float2 addend. The result
   * stays in float; nothing is converted back to half.
   *
   * \param a first multiplicand pair (half precision)
   * \param b second multiplicand pair (half precision)
   * \param c addend pair (single precision)
   * \return float2 holding {a.x * b.x + c.x, a.y * b.y + c.y} computed in float
   */
  __device__ __forceinline__ float2
  fma2(const __half2 a, const __half2 b, const float2 c) {
      const float2 fa = __half22float2(a);
      const float2 fb = __half22float2(b);
      float2 result;
      result.x = fa.x * fb.x + c.x;
      result.y = fa.y * fb.y + c.y;
      return result;
  }
  53. #endif // CUDA_VERSION >= 9000
  54. } // namespace cuda
  55. } // namespace megdnn
  56. // vim: syntax=cpp.doxygen